#!/usr/bin/perl -w #$Id: matrix_comparison.pl,v 1.80 2006/11/22 20:54:51 amonida Exp amonida $# use strict; use warnings; use Archive::Tar; use Data::Dumper; use File::Spec; use Getopt::Std; use IO::File; use List::Util qw(shuffle sum); use constant INVLOGTWO => 1 / log(2); my %opts = ( 'o' => '>-' ); getopts('d:m:o:p:c:s:h', \%opts); if (exists $opts{'h'}) { usage(); exit 1; } my $matrix_file = $opts{'d'}; # Database weight matrices my $meme_file = $opts{'m'}; # MEME matrices in text format my $output = $opts{'o'}; # Output file my $parse_out = $opts{'s'}; # Parsed output for statistical analysis my $threshold = $opts{'c'}/100; # Conservation threshold for matrix comparison my $shuffling = $opts{'p'}; # Do the shuffle! my $do_shuffle = defined($shuffling); if ($do_shuffle) { my $devnull = File::Spec->devnull(); # Setting parsed output file and output file to $devnull when # shuffling $parse_out = $devnull; $output = $devnull; } if ( !defined($matrix_file) || !defined($meme_file) ) { usage(); exit 1; } # Writing header to the output file. my $out_fh = IO::File->new(">$output") or die("can't open $output!\n$!"); print $out_fh "#" x 80, "\n"; print $out_fh "\t\t############ Matrix comparison V1.0 ############\n"; print $out_fh "Input files:\n"; print $out_fh "## Motif file (e.g. MEME): " . $meme_file . "\n"; print $out_fh "## Database file: " . $matrix_file . "\n"; print $out_fh "## Level of conservation: " . $threshold * 100 . "%\n"; print $out_fh "#" x 80, "\n"; my $parse_fh = IO::File->new(">$parse_out") or die("can't open $parse_out!\n$!"); my %alldirs; # All produced files (when run with shuffling) # Change pseudocount here my $pseudocount = 0.0001; # Reading the database matrices my $db_matrices = read_database_matrix($matrix_file); # Run the programme as many times as defined by user, permuting the # matrices each time (-s option). If "-s" is not defined, run the # programme only once WITHOUT permuting the matrices. for ( my $iter = 0 ; $iter < ( $do_shuffle ? $shuffling : 1 ) ; ++$iter ) { # Reading the MEME matrices my $meme_matrices = read_meme_matrix($meme_file); foreach my $matrix ( values %{ $meme_matrices } ) { foreach my $db_matrix ( values %{ $db_matrices } ) { my $db_len = $db_matrix->{LENGTH}; my $db_desc = $db_matrix->{DESC}; my @VOS = (); my @starts = (); my @stops = (); foreach my $meme_matrix (values %{ $matrix->{MATRICES} }) { ####################################################### # For shuffling the matrices if ($do_shuffle) { $meme_matrix = [ shuffle( @{$meme_matrix} ) ]; } ####################################################### my ( $values, $overlaps, $starts_stops ) = overlap_loop( $meme_matrix, $db_matrix->{MATRIX}, $matrix->{LENGTH}, $db_matrix->{LENGTH} ); push ( @{ $VOS[0] }, @{ $values } ) if ( defined( @{ $values } ) ); push (@{ $VOS[1] }, @{ $overlaps }); push (@{ $VOS[2] }, @{ $starts_stops }); } foreach my $start_stop ( @{ $VOS[2] } ) { push @starts, [ @{ $start_stop->[0] } ]; push @stops, [ @{ $start_stop->[1] } ]; } $matrix->{MATCHES}{$db_len}{ $db_matrix->{NAME} } = { VALUES => \@{ $VOS[0] }, OVERLAPS => \@{ $VOS[1] }, STARTS => \@starts, STOPS => \@stops, DESC => $db_desc, LENGTH => $db_len, IC_ARRAY => $db_matrix->{IC_ARRAY} }; } } foreach my $meme_matrix ( values %{$meme_matrices} ) { my $meme_len = $meme_matrix->{LENGTH}; foreach my $match_len ( keys %{ $meme_matrix->{MATCHES} } ) { my $min_element; foreach my $db_match_name ( keys %{ $meme_matrix->{MATCHES}{$match_len} } ) { my $db_match = $meme_matrix->{MATCHES}{$match_len}{$db_match_name}; if ( !defined( $db_match->{NAME} ) ) { $db_match->{NAME} = $db_match_name; } foreach my $val ( 0 .. $#{ $db_match->{VALUES} } ) { my $element = $db_match->{VALUES}[$val]; my $overlap = $db_match->{OVERLAPS}[$val]; my $start = $db_match->{STARTS}[$val]; my $stop = $db_match->{STOPS}[$val]; if ( !defined($min_element) || $element < $min_element ) { $min_element = $element; $db_match->{BEST_SCORE} = $min_element; $db_match->{SSTART} = 1 + $start->[0]; $db_match->{SSTOP} = 1 + $stop->[0]; $db_match->{LSTART} = 1 + $start->[1]; $db_match->{LSTOP} = 1 + $stop->[1]; # Save best overall match for each length # separately. my $best_score_for_length = $meme_matrix->{BEST_MATCHES}{$match_len} {BEST_SCORE}; if ( !defined($best_score_for_length) || $best_score_for_length > $min_element ) { $meme_matrix->{BEST_MATCHES} {$match_len} = $db_match; } } } } } } foreach my $meme_matrix_name ( sort keys %{$meme_matrices} ) { my $meme_matrix = $meme_matrices->{$meme_matrix_name}; print $out_fh "$meme_matrix->{NAME} $meme_matrix->{LENGTH}\n"; print $parse_fh "$meme_matrix->{NAME} $meme_matrix->{LENGTH}\n"; foreach my $match_len ( sort keys %{ $meme_matrix->{BEST_MATCHES} } ) { my $best_db_match = $meme_matrix->{BEST_MATCHES}{$match_len}; next if ( !defined $best_db_match ); printf $parse_fh "%2d %.4f\n", $best_db_match->{LENGTH}, $best_db_match->{BEST_SCORE}; if ( defined( $best_db_match->{DESC} ) ) { printf $out_fh "%22s %20s (%2d) %.4f " . "[%2d -> %2d] - [%2d -> %2d]\n", $best_db_match->{NAME}, $best_db_match->{DESC}, $best_db_match->{LENGTH}, $best_db_match->{BEST_SCORE}, $best_db_match->{SSTART}, $best_db_match->{SSTOP}, $best_db_match->{LSTART}, $best_db_match->{LSTOP}; } else { printf $out_fh "%22s (%2d) %.4f " . "[%2d -> %2d] - [%2d -> %2d]\n", $best_db_match->{NAME}, $best_db_match->{LENGTH}, $best_db_match->{BEST_SCORE}, $best_db_match->{SSTART}, $best_db_match->{SSTOP}, $best_db_match->{LSTART}, $best_db_match->{LSTOP}; } if ($do_shuffle) { # Creating separate directories for each database matrix # length. Best matches will be saved in the appropriate # directory. my $dir = $best_db_match->{LENGTH}; if ( !-d $dir ) { mkdir($dir) or die "$!"; } my $perm_file = File::Spec->catfile( $dir, $meme_matrix->{NAME} ); $alldirs{$dir}{ $meme_matrix->{NAME} } = $perm_file; # Appending the permutation results to each # appropriate file, i.e. the database matrix length. my $perm_fh = IO::File->new( ">>$perm_file" ) or die( "can't open $perm_file!\n$!" ); printf $perm_fh "%.4f\n", $best_db_match->{BEST_SCORE}; $perm_fh->close(); } } } # print (Dumper $meme_matrices); } $parse_fh->close(); $out_fh->close(); # Compressing the directories into a tar file and removing the # directories afterwards. if ($do_shuffle) { my $tar = Archive::Tar->new(); foreach my $dir ( keys %alldirs ) { # Add file to the archive, remove it drom disk. $tar->add_files( values %{ $alldirs{$dir} } ); unlink values %{ $alldirs{$dir} }; # Remove the empty directory. rmdir($dir) or warn("Failed to rmdir($dir): $!"); } if ( !$tar->write( 'permutations.tgz', 9 ) ) { warn "Failed writing compressed tar file\n"; warn "Trying again, without compression\n"; $tar->write( 'permutations.tar' ); } } # Read MEME matrices sub read_meme_matrix { my $file = shift; my ($matrix_name, $matrix_desc); my ($curr_matrix, $reverse_matrix); my %hash_of_memes; my $meme_fh = IO::File->new("<$file") or die("can't open $file!\n$!"); while ( defined( my $line = $meme_fh->getline() ) ) { next if ( $line =~ /^\s*$/ ); chomp $line; if ( $line =~ /^(MA\d+)\s(\S+)/ || $line =~ /^V\$(\S*)(?:\s*(\S*))/ ) { $matrix_name = $1; $hash_of_memes{$matrix_name}{NAME} = $matrix_name; if ( defined($2) ) { $matrix_desc = $2; $hash_of_memes{$matrix_name}{DESC} = $matrix_desc; } # Initialise empty matrix $hash_of_memes{$matrix_name}{MATRICES}{MATRIX} = []; $hash_of_memes{$matrix_name}{MATRICES}{REVERSE_MATRIX} = []; # Have a temporary matrix $curr_matrix = $hash_of_memes{$matrix_name}{MATRICES}{MATRIX}; $reverse_matrix = $hash_of_memes{$matrix_name}{MATRICES}{REVERSE_MATRIX}; } elsif ( $line =~ /^[ACGT]\s+\[\s*(.*)\s*\]/ ) { # Read in the matrix, compatible with MEME # Adding pseudocount to the matrix elements my @row = map { $_ += $pseudocount } split( /\s+/, $1 ); my @reverse_row = reverse( @row ); $hash_of_memes{$matrix_name}{LENGTH} = scalar @row; for ( my $elem = 0 ; $elem < scalar @row ; ++$elem ) { # Push the current element in the MATRIX-field of the # hash key. push( @{ $curr_matrix->[$elem] }, $row[$elem] ); push( @{ $reverse_matrix->[$elem] }, $reverse_row[$elem] ); } } if ( $line =~ /^(MOTIF|MATRIX)\s*(\d+)/ ) { $matrix_name = $1 . "_" . $2; } elsif ( $line =~ /^letter-probability matrix:/ ) { # Taking the letter probability matrices while ( defined( my $matrix_line = $meme_fh->getline() ) ) { last if ( $matrix_line =~ /^-+$/ ); last if ( $matrix_line =~ /^\s*$/ ); chomp $matrix_line; $matrix_line =~ s/^\s+//; $matrix_line =~ s/\s+$//; # Adding pseudocount to the matrix elements my @row = map { $_ += $pseudocount } split( /\s+/, $matrix_line ); # Complementary row my @reverse_row = reverse @row; push( @{ $hash_of_memes{$matrix_name}{MATRICES}{MATRIX} }, \@row ); # Pushing the rows on the sense/antisense matrices unshift( @{ $hash_of_memes{$matrix_name}{MATRICES} {REVERSE_MATRIX} }, \@reverse_row ); } $hash_of_memes{$matrix_name}{LENGTH} = scalar @{ $hash_of_memes{$matrix_name}{MATRICES}{MATRIX} }; $hash_of_memes{$matrix_name}{NAME} = $matrix_name; } } $meme_fh->close(); my $IC; foreach my $motif ( values %hash_of_memes ) { foreach my $matrix ( values %{ $motif->{MATRICES} } ) { my $total_IC; # Normalize the rows foreach my $row ( @{$matrix} ) { my $row_sum = sum( @{$row} ); map { $_ /= $row_sum } @{$row}; } if ( !( defined( $motif->{IC_ARRAY} ) ) ) { foreach my $row ( @{$matrix} ) { my $ic = 0; foreach my $elem ( @{ $row } ) { $ic += $elem * log2($elem); } $IC = 2 + $ic; push ( @{ $motif->{IC_ARRAY} }, $IC ); $total_IC += $IC } } } } # print Dumper(\%hash_of_memes); return \%hash_of_memes; } # Read database matrices sub read_database_matrix { my $file = shift; my ( $matrix_name, $matrix_desc ); my %hash_of_dbtfs; my $curr_matrix; my %hash_of_memes; my @sorted_hash; my $database_fh = IO::File->new("<$file") or die("can't open $file!\n$!"); while ( defined( my $line = $database_fh->getline() ) ) { next if ( $line =~ /^\s*$/ ); chomp $line; if ( $line =~ /^(MA\d+)\s(\S+)/ || $line =~ /^V\$(\S*)(?:\s*(\S*))/ ) { $matrix_name = $1; $hash_of_dbtfs{$matrix_name}{NAME} = $matrix_name; if ( defined($2) ) { $matrix_desc = $2; $hash_of_dbtfs{$matrix_name}{DESC} = $matrix_desc; } # Initialise empty matrix $hash_of_dbtfs{$matrix_name}{MATRIX} = []; # Have a temporary matrix $curr_matrix = $hash_of_dbtfs{$matrix_name}{MATRIX}; } elsif ( $line =~ /^[ACGT]\s+\[\s*(.*)\s*\]/ ) { # Read in the matrix, compatible with MEME # Adding pseudocount to the matrix elements my @row = map { $_ += $pseudocount } split( /\s+/, $1 ); $hash_of_dbtfs{$matrix_name}{LENGTH} = scalar @row; for ( my $elem = 0 ; $elem < scalar @row ; ++$elem ) { # Push the current element in the MATRIX-field of the # hash key. push( @{ $curr_matrix->[$elem] }, $row[$elem] ); } } elsif ( $line =~ /^(MOTIF|MATRIX)\s*(\d+)/ ) { $matrix_name = $1 . "_" . $2; } elsif ( $line =~ /^letter-probability matrix:/ ) { # Taking the letter probability matrices while ( defined( my $matrix_line = $database_fh->getline() ) ) { last if ( $matrix_line =~ /^-+$/ ); last if ( $matrix_line =~ /^\s*$/ ); chomp $matrix_line; $matrix_line =~ s/^\s+//; $matrix_line =~ s/\s+$//; # Adding pseudocount to the matrix elements my @row = map { $_ += $pseudocount } split( /\s+/, $matrix_line ); push( @{ $hash_of_dbtfs{$matrix_name}{MATRIX} }, \@row ); $hash_of_dbtfs{$matrix_name}{LENGTH} = scalar @{ $hash_of_dbtfs{$matrix_name}{MATRIX} }; $hash_of_dbtfs{$matrix_name}{NAME} = $matrix_name; } } } $database_fh->close(); # Normalize the rows foreach my $tf ( values %hash_of_dbtfs ) { my $matrix = $tf->{MATRIX}; foreach my $row ( @{$matrix} ) { my $row_sum = sum( @{$row} ); map { $_ /= $row_sum } @{$row}; } } my $IC; foreach my $tf ( values %hash_of_dbtfs ) { my $total_IC; my $matrix = $tf->{MATRIX}; foreach my $row ( @{$matrix} ) { my $ic= 0; foreach my $elem ( @{ $row } ) { $ic += $elem * log2($elem); } $IC = 2 + $ic; push ( @{ $tf->{IC_ARRAY} }, $IC ); $total_IC += $IC; } } # print Dumper(\%hash_of_dbtfs); return \%hash_of_dbtfs; } ############################################################## # Subroutines # Calculate log_2 sub log2 { return INVLOGTWO * log(shift); } # Calculate the mutual information for the two matirces sub double_sum { my $s_mat = shift; my $l_mat = shift; my $s_start = shift; my $s_stop = shift; my $l_start = shift; my $l_stop = shift; my $sum = 0; my ( $IC_short, $IC_long ) = (0, 0); my $count = 0; # The outer sum over the current overlap. Overlap = length of # the shortest motif. for ( my ( $s_pos, $l_pos ) = ( $s_start, $l_start ) ; $s_pos <= $s_stop ; ++$s_pos, ++$l_pos ) { # Keeping track on the length of the current # overlap. This is later used for calculating information # content if the overlap was completely conserved. $count++; # The inner sum: over all 4 bases in each matrix. Length # is the length of overlap between the two matrices. ########################################################### # Relative entropy between matrices my $s = $s_mat->[$s_pos]; my $l = $l_mat->[$l_pos]; my $s_ic = 0; my $l_ic = 0; for ( my $elem = 0 ; $elem < 4 ; ++$elem ) { $s_ic -= $s->[$elem] * log2( $s->[$elem] ); $l_ic -= $l->[$elem] * log2( $l->[$elem] ); $sum += $s->[$elem] * log2( $s->[$elem] / $l->[$elem] ); } $IC_short += 2 - $s_ic; $IC_long += 2 - $l_ic; } # The total amount IC is 2 bits times the length of the # current overlap. If the actual IC for any of the comparing # partial matrices is less than %10 of the total IC, then # the comparison is excluded. The threshold has been set # arbitrarily but can be changed to a user defined threshold. my $conserved_overlap = 2 * $count; if ( ($IC_short/$conserved_overlap) > $threshold || ($IC_long/$conserved_overlap) > $threshold ) { return $sum; } # Return undef for all the calculations that do not pass the # above criteria. return undef; } sub overlap_loop { my $meme_matrix = shift; my $db_matrix = shift; my $meme_length = shift; my $db_length = shift; my ( @values, @overlaps, @starts_stops ); my ( $short, $long ); my ( $short_matrix, $long_matrix ); if ( $meme_length < $db_length ) { $short = $meme_length; $long = $db_length; $short_matrix = $meme_matrix; $long_matrix = $db_matrix; } else { $short = $db_length; $long = $meme_length; $short_matrix = $db_matrix; $long_matrix = $meme_matrix; } for ( my $offset = 1 - $short ; $offset < $long ; ++$offset ) { my ( $l_start, $l_stop, $s_start, $s_stop ) = overlap_length( $short, $long, $offset ); my $overlap = abs( $s_start - $s_stop ) + 1; # Minimum length of the overlap = 6 bases if ( $overlap >= 6 ) { my $shortmat_longmat = double_sum( $short_matrix, $long_matrix, $s_start, $s_stop, $l_start, $l_stop ); my $longmat_shortmat = double_sum( $long_matrix, $short_matrix, $l_start, $l_stop, $s_start, $s_stop ); next if ( ( !defined($shortmat_longmat) ) || ( !defined($longmat_shortmat) ) ); # Take mean of the two calculated scores my $mean = 0.5 * ( ($shortmat_longmat + $longmat_shortmat) / $overlap ); # Save the score (mean) and the length of overlaps # and the positions at which the mean score was # calculated at. push @values, $mean; push @overlaps, $overlap; push @starts_stops, [ [ $s_start, $l_start ], [ $s_stop, $l_stop ] ]; } } return ( \@values, \@overlaps, \@starts_stops ); } # Calculate the start and stop positions for the current overlap sub overlap_length { my $s = shift; my $l = shift; my $offset = shift; my $l_start = ( $offset > 0 ) ? $offset : 0; my $l_stop = ( $offset + $s < $l ) ? ( $offset + $s - 1 ) : $l - 1; my $s_start = ( $offset < 0 ) ? $s - ( $l_stop - $l_start + 1 ) : 0; my $s_stop = ( $offset < 0 ) ? $s - 1 : $l_stop - $l_start; return ( $l_start, $l_stop, $s_start, $s_stop ); } # Usage sub usage { print STDERR <