#!/usr/bin/perl -w

#$Id: matrix_comparison.pl,v 1.80 2006/11/22 20:54:51 amonida Exp amonida $#

use strict;
use warnings;

use Archive::Tar;
use Data::Dumper;
use File::Spec;
use Getopt::Std;
use IO::File;
use List::Util qw(shuffle sum);

use constant INVLOGTWO => 1 / log(2);

my %opts = ( 'o' => '>-' );
getopts('d:m:o:p:c:s:h', \%opts);

if (exists $opts{'h'}) {
    usage();
    exit 1;
}

my $matrix_file = $opts{'d'};   # Database weight matrices
my $meme_file   = $opts{'m'};   # MEME matrices in text format
my $output      = $opts{'o'};   # Output file
my $parse_out   = $opts{'s'};   # Parsed output for statistical analysis
my $threshold	= $opts{'c'}/100;
    # Conservation threshold for matrix comparison
my $shuffling   = $opts{'p'};   # Do the shuffle!

my $do_shuffle = defined($shuffling);

if ($do_shuffle) {
    my $devnull = File::Spec->devnull();

    # Setting parsed output file and output file to $devnull when
    # shuffling
    $parse_out = $devnull;
    $output    = $devnull;
}

if ( !defined($matrix_file) || !defined($meme_file) ) {
    usage();
    exit 1;
}

# Writing header to the output file.
my $out_fh = IO::File->new(">$output")
  or die("can't open $output!\n$!");
print $out_fh "#" x 80, "\n";
print $out_fh "\t\t############ Matrix comparison V1.0 ############\n";
print $out_fh "Input files:\n";
print $out_fh "## Motif file (e.g. MEME): " . $meme_file . "\n";
print $out_fh "## Database file: " . $matrix_file . "\n";
print $out_fh "## Level of conservation: " . $threshold * 100 . "%\n";
print $out_fh "#" x 80, "\n";

my $parse_fh = IO::File->new(">$parse_out")
    or die("can't open $parse_out!\n$!");

my %alldirs;    # All produced files (when run with shuffling)

# Change pseudocount here
my $pseudocount = 0.0001;

# Reading the database matrices
my $db_matrices = read_database_matrix($matrix_file);

# Run the programme as many times as defined by user, permuting the
# matrices each time (-s option). If "-s" is not defined, run the
# programme only once WITHOUT permuting the matrices.
for ( my $iter = 0 ;
    $iter < ( $do_shuffle ? $shuffling : 1 ) ; ++$iter )
{
    # Reading the MEME matrices
    my $meme_matrices = read_meme_matrix($meme_file);

    foreach my $matrix ( values %{ $meme_matrices } ) {
	foreach my $db_matrix ( values %{ $db_matrices } ) {
            my $db_len  = $db_matrix->{LENGTH};
            my $db_desc = $db_matrix->{DESC};
            my @VOS     = ();
            my @starts  = ();
            my @stops   = ();

	    foreach my $meme_matrix (values %{ $matrix->{MATRICES} }) {
                #######################################################
                # For shuffling the matrices
                if ($do_shuffle) {
                    $meme_matrix = [ shuffle( @{$meme_matrix} ) ];
                }
                #######################################################

                my ( $values, $overlaps, $starts_stops ) = overlap_loop(
                    $meme_matrix,        $db_matrix->{MATRIX},
                    $matrix->{LENGTH},   $db_matrix->{LENGTH}
                );

		push ( @{ $VOS[0] }, @{ $values } )
		    if ( defined( @{ $values } ) );
		push (@{ $VOS[1] }, @{ $overlaps });
		push (@{ $VOS[2] }, @{ $starts_stops });
	    }

            foreach my $start_stop ( @{ $VOS[2] } ) {
                push @starts, [ @{ $start_stop->[0] } ];
                push @stops,  [ @{ $start_stop->[1] } ];
            }

            $matrix->{MATCHES}{$db_len}{ $db_matrix->{NAME} } = {
                VALUES   => \@{ $VOS[0] },
                OVERLAPS => \@{ $VOS[1] },
                STARTS   => \@starts,
                STOPS    => \@stops,
                DESC     => $db_desc,
                LENGTH   => $db_len,
		IC_ARRAY => $db_matrix->{IC_ARRAY}
            };
	}
    }

    foreach my $meme_matrix ( values %{$meme_matrices} ) {
        my $meme_len = $meme_matrix->{LENGTH};

        foreach my $match_len ( keys %{ $meme_matrix->{MATCHES} } ) {
            my $min_element;

            foreach my $db_match_name (
                keys %{ $meme_matrix->{MATCHES}{$match_len} } )
            {

                my $db_match =
                  $meme_matrix->{MATCHES}{$match_len}{$db_match_name};

                if ( !defined( $db_match->{NAME} ) ) {
                    $db_match->{NAME} = $db_match_name;
                }

                foreach my $val ( 0 .. $#{ $db_match->{VALUES} } ) {
                    my $element = $db_match->{VALUES}[$val];
                    my $overlap = $db_match->{OVERLAPS}[$val];
                    my $start   = $db_match->{STARTS}[$val];
                    my $stop    = $db_match->{STOPS}[$val];

		    if ( !defined($min_element) ||

			$element < $min_element )
		    {
			$min_element = $element;

			$db_match->{BEST_SCORE} = $min_element;
			$db_match->{SSTART}     = 1 + $start->[0];
			$db_match->{SSTOP}      = 1 + $stop->[0];
			$db_match->{LSTART}     = 1 + $start->[1];
			$db_match->{LSTOP}      = 1 + $stop->[1];

			# Save best overall match for each length
			# separately.
			my $best_score_for_length =
			  $meme_matrix->{BEST_MATCHES}{$match_len}
			      {BEST_SCORE};

			if ( !defined($best_score_for_length) ||
			    $best_score_for_length >
			    $min_element )
			{
			    $meme_matrix->{BEST_MATCHES}
			      {$match_len} = $db_match;
			}
		    }
                }
            }
        }
    }

    foreach my $meme_matrix_name ( sort keys %{$meme_matrices} ) {
        my $meme_matrix = $meme_matrices->{$meme_matrix_name};

        print $out_fh "$meme_matrix->{NAME} $meme_matrix->{LENGTH}\n";
        print $parse_fh "$meme_matrix->{NAME} $meme_matrix->{LENGTH}\n";

        foreach
          my $match_len ( sort keys %{ $meme_matrix->{BEST_MATCHES} } )
        {
            my $best_db_match =
              $meme_matrix->{BEST_MATCHES}{$match_len};

            next if ( !defined $best_db_match );

            printf $parse_fh "%2d %.4f\n", $best_db_match->{LENGTH},
              $best_db_match->{BEST_SCORE};

            if ( defined( $best_db_match->{DESC} ) ) {
                printf $out_fh "%22s %20s (%2d) %.4f    "
                  . "[%2d -> %2d] - [%2d -> %2d]\n",
                  $best_db_match->{NAME}, $best_db_match->{DESC},
                  $best_db_match->{LENGTH},
                  $best_db_match->{BEST_SCORE},
                  $best_db_match->{SSTART}, $best_db_match->{SSTOP},
                  $best_db_match->{LSTART}, $best_db_match->{LSTOP};
            } else {
                printf $out_fh "%22s (%2d) %.4f    "
                  . "[%2d -> %2d] - [%2d -> %2d]\n",
                  $best_db_match->{NAME}, $best_db_match->{LENGTH},
                  $best_db_match->{BEST_SCORE},
                  $best_db_match->{SSTART}, $best_db_match->{SSTOP},
                  $best_db_match->{LSTART}, $best_db_match->{LSTOP};
            }

            if ($do_shuffle) {
                # Creating separate directories for each database matrix
                # length. Best matches will be saved in the appropriate
                # directory.
                my $dir = $best_db_match->{LENGTH};

                if ( !-d $dir ) { mkdir($dir) or die "$!"; }

                my $perm_file =
                  File::Spec->catfile( $dir, $meme_matrix->{NAME} );

                $alldirs{$dir}{ $meme_matrix->{NAME} } = $perm_file;

                # Appending the permutation results to each
                # appropriate file, i.e. the database matrix length.
                my $perm_fh = IO::File->new( ">>$perm_file" )
                  or die( "can't open $perm_file!\n$!" );

                printf $perm_fh "%.4f\n", $best_db_match->{BEST_SCORE};

                $perm_fh->close();
            }
        }
    }

#    print (Dumper $meme_matrices);
}
$parse_fh->close();
$out_fh->close();

# Compressing the directories into a tar file and removing the
# directories afterwards.
if ($do_shuffle) {
    my $tar = Archive::Tar->new();

    foreach my $dir ( keys %alldirs ) {

        # Add file to the archive, remove it drom disk.
        $tar->add_files( values %{ $alldirs{$dir} } );
        unlink values %{ $alldirs{$dir} };

        # Remove the empty directory.
        rmdir($dir) or warn("Failed to rmdir($dir): $!");
    }

    if ( !$tar->write( 'permutations.tgz', 9 ) ) {
        warn "Failed writing compressed tar file\n";
        warn "Trying again, without compression\n";
        $tar->write( 'permutations.tar' );
    }
}


# Read MEME matrices
sub read_meme_matrix
{
    my $file = shift;

    my ($matrix_name, $matrix_desc);
    my ($curr_matrix, $reverse_matrix);
    my %hash_of_memes;

    my $meme_fh = IO::File->new("<$file")
      or die("can't open $file!\n$!");

    while ( defined( my $line = $meme_fh->getline() ) ) {
        next if ( $line =~ /^\s*$/ );

        chomp $line;

        if (   $line =~ /^(MA\d+)\s(\S+)/
            || $line =~ /^V\$(\S*)(?:\s*(\S*))/ )
        {
            $matrix_name = $1;
            $hash_of_memes{$matrix_name}{NAME} = $matrix_name;

            if ( defined($2) ) {
                $matrix_desc = $2;
                $hash_of_memes{$matrix_name}{DESC} = $matrix_desc;
            }

            # Initialise empty matrix
            $hash_of_memes{$matrix_name}{MATRICES}{MATRIX} = [];
            $hash_of_memes{$matrix_name}{MATRICES}{REVERSE_MATRIX} = [];

            # Have a temporary matrix
            $curr_matrix =
		$hash_of_memes{$matrix_name}{MATRICES}{MATRIX};

	    $reverse_matrix =
		$hash_of_memes{$matrix_name}{MATRICES}{REVERSE_MATRIX};

        } elsif ( $line =~ /^[ACGT]\s+\[\s*(.*)\s*\]/ ) {
            # Read in the matrix, compatible with MEME

            # Adding pseudocount to the matrix elements
            my @row = map { $_ += $pseudocount } split( /\s+/, $1 );
	    my @reverse_row = reverse( @row );

            $hash_of_memes{$matrix_name}{LENGTH} = scalar @row;

            for ( my $elem = 0 ; $elem < scalar @row ; ++$elem ) {
                # Push the current element in the MATRIX-field of the
                # hash key.
                push( @{ $curr_matrix->[$elem] }, $row[$elem] );
		push( @{ $reverse_matrix->[$elem] }, $reverse_row[$elem] );
            }
        }

	if ( $line =~ /^(MOTIF|MATRIX)\s*(\d+)/ ) {
            $matrix_name = $1 . "_" . $2;
        } elsif ( $line =~ /^letter-probability matrix:/ ) {
            # Taking the letter probability matrices
            while ( defined( my $matrix_line = $meme_fh->getline() ) ) {
                last if ( $matrix_line =~ /^-+$/ );
                last if ( $matrix_line =~ /^\s*$/ );

                chomp $matrix_line;

                $matrix_line =~ s/^\s+//;
                $matrix_line =~ s/\s+$//;

                # Adding pseudocount to the matrix elements
                my @row =
                  map { $_ += $pseudocount }
                  split( /\s+/, $matrix_line );

                # Complementary row
                my @reverse_row = reverse @row;

                push(
                    @{
                        $hash_of_memes{$matrix_name}{MATRICES}{MATRIX}
                      },
                    \@row
                );

                # Pushing the rows on the sense/antisense matrices
                unshift(
                    @{
                        $hash_of_memes{$matrix_name}{MATRICES}
                          {REVERSE_MATRIX}
                      },
                    \@reverse_row
                );
            }

            $hash_of_memes{$matrix_name}{LENGTH} = scalar
              @{ $hash_of_memes{$matrix_name}{MATRICES}{MATRIX} };

            $hash_of_memes{$matrix_name}{NAME} = $matrix_name;
        }
    }

    $meme_fh->close();

    my $IC;
    foreach my $motif ( values %hash_of_memes ) {
	foreach my $matrix ( values %{ $motif->{MATRICES} } ) {
	    my $total_IC;

	    # Normalize the rows
	    foreach my $row ( @{$matrix} ) {
		my $row_sum = sum( @{$row} );
		map { $_ /= $row_sum } @{$row};
	    }

	    if ( !( defined( $motif->{IC_ARRAY} ) ) ) {
		foreach my $row ( @{$matrix} ) {
		    my $ic = 0;
		    foreach my $elem ( @{ $row } ) {
			$ic += $elem * log2($elem);
		    }
		    $IC = 2 + $ic;
		    push ( @{ $motif->{IC_ARRAY} }, $IC );
		    $total_IC += $IC
		}
	    }
	}
    }


#    print Dumper(\%hash_of_memes);
    return \%hash_of_memes;
}

# Read database matrices
sub read_database_matrix
{
    my $file = shift;

    my ( $matrix_name, $matrix_desc );
    my %hash_of_dbtfs;
    my $curr_matrix;

    my %hash_of_memes;
    my @sorted_hash;

    my $database_fh = IO::File->new("<$file")
      or die("can't open $file!\n$!");

    while ( defined( my $line = $database_fh->getline() ) ) {
        next if ( $line =~ /^\s*$/ );

        chomp $line;

        if ( $line =~ /^(MA\d+)\s(\S+)/ ||
	    $line =~ /^V\$(\S*)(?:\s*(\S*))/ )
        {
            $matrix_name = $1;
            $hash_of_dbtfs{$matrix_name}{NAME} = $matrix_name;

            if ( defined($2) ) {
                $matrix_desc = $2;
                $hash_of_dbtfs{$matrix_name}{DESC} = $matrix_desc;
            }

            # Initialise empty matrix
            $hash_of_dbtfs{$matrix_name}{MATRIX} = [];

            # Have a temporary matrix
            $curr_matrix = $hash_of_dbtfs{$matrix_name}{MATRIX};
        } elsif ( $line =~ /^[ACGT]\s+\[\s*(.*)\s*\]/ ) {
            # Read in the matrix, compatible with MEME

            # Adding pseudocount to the matrix elements
            my @row = map { $_ += $pseudocount } split( /\s+/, $1 );

            $hash_of_dbtfs{$matrix_name}{LENGTH} = scalar @row;

            for ( my $elem = 0 ; $elem < scalar @row ; ++$elem ) {
                # Push the current element in the MATRIX-field of the
                # hash key.
                push( @{ $curr_matrix->[$elem] }, $row[$elem] );

            }
        } elsif ( $line =~ /^(MOTIF|MATRIX)\s*(\d+)/ ) {
            $matrix_name = $1 . "_" . $2;
        } elsif ( $line =~ /^letter-probability matrix:/ ) {
            # Taking the letter probability matrices
            while (
                defined( my $matrix_line = $database_fh->getline() ) )
            {
                last if ( $matrix_line =~ /^-+$/ );
                last if ( $matrix_line =~ /^\s*$/ );

                chomp $matrix_line;

                $matrix_line =~ s/^\s+//;
                $matrix_line =~ s/\s+$//;

                # Adding pseudocount to the matrix elements
                my @row =
                  map { $_ += $pseudocount }
                  split( /\s+/, $matrix_line );

                push( @{ $hash_of_dbtfs{$matrix_name}{MATRIX} }, \@row );
                $hash_of_dbtfs{$matrix_name}{LENGTH} =
                  scalar @{ $hash_of_dbtfs{$matrix_name}{MATRIX} };

                $hash_of_dbtfs{$matrix_name}{NAME} = $matrix_name;
            }
        }
    }

    $database_fh->close();

    # Normalize the rows
    foreach my $tf ( values %hash_of_dbtfs ) {
        my $matrix = $tf->{MATRIX};

        foreach my $row ( @{$matrix} ) {
            my $row_sum = sum( @{$row} );
            map { $_ /= $row_sum } @{$row};
        }
    }

    my $IC;
    foreach my $tf ( values %hash_of_dbtfs ) {
    my $total_IC;
        my $matrix = $tf->{MATRIX};
	foreach my $row ( @{$matrix} ) {
	    my $ic= 0;
	    foreach my $elem ( @{ $row } ) {
		$ic += $elem * log2($elem);
	    }
	    $IC = 2 + $ic;
	    push ( @{ $tf->{IC_ARRAY} }, $IC );
	    $total_IC += $IC;
	}
    }

#    print Dumper(\%hash_of_dbtfs);
    return \%hash_of_dbtfs;
}

##############################################################
# Subroutines

# Calculate log_2
sub log2
{
    return INVLOGTWO * log(shift);
}

# Calculate the mutual information for the two matirces
sub double_sum
{
    my $s_mat   = shift;
    my $l_mat   = shift;
    my $s_start = shift;
    my $s_stop  = shift;
    my $l_start = shift;
    my $l_stop  = shift;

    my $sum = 0;
    my ( $IC_short, $IC_long ) = (0, 0);
    my $count = 0;

    # The outer sum over the current overlap. Overlap = length of
    # the shortest motif.
    for (
        my ( $s_pos, $l_pos ) = ( $s_start, $l_start ) ;
        $s_pos <= $s_stop ;
        ++$s_pos, ++$l_pos
      )
    {
        # Keeping track on the length of the current
        # overlap. This is later used for calculating information
        # content if the overlap was completely conserved.
	$count++;

        # The inner sum: over all 4 bases in each matrix. Length
        # is the length of overlap between the two matrices.

	###########################################################
	# Relative entropy between matrices
        my $s = $s_mat->[$s_pos];
        my $l = $l_mat->[$l_pos];

	my $s_ic = 0;
	my $l_ic = 0;
        for ( my $elem = 0 ; $elem < 4 ; ++$elem ) {
	    $s_ic -= $s->[$elem] * log2( $s->[$elem] );
	    $l_ic -= $l->[$elem] * log2( $l->[$elem] );
            $sum += $s->[$elem] * log2( $s->[$elem] / $l->[$elem] );
        }
	$IC_short += 2 - $s_ic;
	$IC_long += 2 - $l_ic;
    }

    # The total amount IC is 2 bits times the length of the
    # current overlap. If the actual IC for any of the comparing
    # partial matrices is less than %10 of the total IC, then
    # the comparison is excluded. The threshold has been set
    # arbitrarily but can be changed to a user defined threshold.
    my $conserved_overlap = 2 * $count;
    if ( ($IC_short/$conserved_overlap) > $threshold ||
	($IC_long/$conserved_overlap) > $threshold ) {

	return $sum;
    }

    # Return undef for all the calculations that do not pass the
    # above criteria.
    return undef;
}

sub overlap_loop
{
    my $meme_matrix = shift;
    my $db_matrix   = shift;
    my $meme_length = shift;
    my $db_length   = shift;

    my ( @values, @overlaps, @starts_stops );

    my ( $short, $long );
    my ( $short_matrix, $long_matrix );

    if ( $meme_length < $db_length ) {
        $short        = $meme_length;
        $long         = $db_length;
        $short_matrix = $meme_matrix;
        $long_matrix  = $db_matrix;
    } else {
        $short        = $db_length;
        $long         = $meme_length;
        $short_matrix = $db_matrix;
        $long_matrix  = $meme_matrix;
    }

    for ( my $offset = 1 - $short ; $offset < $long ; ++$offset ) {
        my ( $l_start, $l_stop, $s_start, $s_stop ) =
          overlap_length( $short, $long, $offset );

        my $overlap = abs( $s_start - $s_stop ) + 1;

	# Minimum length of the overlap = 6 bases
        if ( $overlap >= 6 ) {

            my $shortmat_longmat =
              double_sum( $short_matrix, $long_matrix, $s_start,
                $s_stop, $l_start, $l_stop );

            my $longmat_shortmat =
              double_sum( $long_matrix, $short_matrix, $l_start,
                $l_stop, $s_start, $s_stop );

	    next if ( ( !defined($shortmat_longmat) ) ||
		( !defined($longmat_shortmat) ) );

	    # Take mean of the two calculated scores
            my $mean = 0.5 *
		( ($shortmat_longmat + $longmat_shortmat) / $overlap );

            # Save the score (mean) and the length of overlaps
            # and the positions at which the mean score was
            # calculated at.
            push @values,   $mean;
            push @overlaps, $overlap;
            push @starts_stops,
              [ [ $s_start, $l_start ], [ $s_stop, $l_stop ] ];
        }
    }

    return ( \@values, \@overlaps, \@starts_stops );
}

# Calculate the start and stop positions for the current overlap
sub overlap_length
{
    my $s      = shift;
    my $l      = shift;
    my $offset = shift;

    my $l_start =
	( $offset > 0 ) ? $offset : 0;
    my $l_stop  =
	( $offset + $s < $l ) ? ( $offset + $s - 1 ) : $l - 1;
    my $s_start =
	( $offset < 0 ) ? $s - ( $l_stop - $l_start + 1 ) : 0;
    my $s_stop  =
	( $offset < 0 ) ? $s - 1 : $l_stop - $l_start;

    return ( $l_start, $l_stop, $s_start, $s_stop );
}

# Usage
sub usage
{
    print STDERR <<EOT;
matrix_comparison.pl version 1.0
Amonida Zadissa. 2006 (AgResearch, Otago University)

A utility for comparing two frequency matrices.

Usage:

\t$0
\t  -m motif_file (e.g. MEME_file in text format)
\t  -d database_file (containing the frequency matrices only)
\t  -s file_for_statistics
\t  -c conservation_threshold
\t  -p permutation_rounds
\t   [ -o output_file ]
\t$0 -h

EOT
}