#!/usr/bin/perl
# This file is part of ModPipe, Copyright 1997-2014 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

use strict;
use Getopt::Long;
use File::Basename;

use MPLib::Version;
use PLLib::Utils;
use PLLib::Sequence;
use PLLib::Alignment;

my $tgtcode = '';
my $alilist  = '';
my @alifiles = ();
my $alifmt = 'PIR';
my $outfile = '';
my $grpfile = '';
my $lrgfile = '';
my $ovlp = 30;
my $pcovlp = 30;
my $nonovlp = 10;
my $pcnonovlp = 10;
my $idcol = 0;
my $pcidcol = 20;
my $help = '';

# -- Get command line options
GetOptions (
           "target_code=s"           => \$tgtcode,
           "alignments_list=s"        => \$alilist,
           "alignments_format=s"     => \$alifmt,
           "output_filename=s"       => \$outfile,
           "groups_filename=s"       => \$grpfile,
           "largest_filename=s"      => \$lrgfile,
           "min_overlap=i"           => \$ovlp,
           "min_pc_overlap=f"        => \$pcovlp,
           "max_nonoverlap=i"        => \$nonovlp,
           "max_pc_nonoverlap=f"     => \$pcnonovlp,
           "min_identity=i"          => \$idcol,
           "min_pc_identity=f"       => \$pcidcol,
           "help"                    => \$help,
           "version"                 => sub { VersionMessage() },
           );

# -- Print usage information
if ( $help ) {
   &usage;
   exit 0;
}

# --- Get Program name
my $subrname = GetSubrName();

# -- Check input arguments
unless ( $tgtcode && $alilist){
   print "${subrname}__E> Missing mandatory options\n\n";
   die "Try $subrname --help for usage information\n";
}

# -- Set the various default parameters
$outfile = $outfile ? $outfile : fileparse($alilist, '\..*') . ".sel";
$grpfile = $grpfile ? $grpfile : fileparse($alilist, '\..*') . ".grp";
$lrgfile = $lrgfile ? $lrgfile : fileparse($alilist, '\..*') . ".lrg";

# --- Check for existence of the file with alignment filenames
die "${subrname}__E> Could not find file with alignment filenames: $alilist\n"
   unless ( -e $alilist );

# -- Check alignment formats
die "${subrname}__E> This script currently handles ONLY PIR formats for alignments\n"
  unless ( $alifmt =~ /\bPIR\b/i );

# -- Get the architecture of the machine
my $arch;
$arch = GetArch()
  or die "${subrname}__E> Cannot run on this architecture: $arch\n";

# -- Read in the list of alignments to merge
my $fh_alilist = OpenFile($alilist);
while ( chomp(my $alifile = <$fh_alilist>) ){
  if ( -e $alifile ){
    push @alifiles, $alifile;
  } else {
      warn "${subrname}__E> Could not find alignment file $alifile\n";
      next;
    }
}

# -- Ensure there is at least one file to process
die "${subrname}__E> Not enough alignment files to cluster\n"
  unless ( scalar(@alifiles) > 2 );

# -- Read in the alignment files into an array of ali objects
my @alns = ();
foreach my $ali ( @alifiles ){
  my $a = undef;
  $a = ReadAlignment($a, $ali, 'PIR', 'VERTICAL');
  push @alns, $a;
}

# -- Sort the alignments by the length of the aligned portion
#    of the target
@alns = map { $_->[1] }
        sort { $b->[0] <=> $a->[0] }
        map { [ GetTargetLength($_, $tgtcode), $_ ] }
        @alns;

# -- Cluster the alignments
my $repstack = my $clusters = [];
($repstack, $clusters) = ClusterAlignments(\@alns, $tgtcode,
                              $ovlp, $pcovlp, $nonovlp, $pcnonovlp,
                              $idcol, $pcidcol);

# -- Check clustering results
die "${subrname}__E> Clustering failed\n"
  unless ( scalar(@$repstack) == scalar(@$clusters) && 
           scalar(@$repstack) > 0 );

warn "${subrname}__M> Clustering produced ", scalar(@$repstack), " representatives\n";

# -- Write out the representative alignments
my $fh_out = OpenNewFile( $outfile );
my $fh_grp = OpenNewFile( $grpfile );
foreach my $i ( 0 .. $#{ $repstack } ){
  my $ali = $repstack->[$i];
  print $fh_out $ali->filename, "\n";

  print $fh_grp $ali->filename, " : ", scalar(@{$clusters->[$i]}), "\n";
  foreach my $a ( @{ $clusters->[$i] } ){
    print $fh_grp "   ", $a->filename, "\n";
  }
}
close($fh_grp);
close($fh_out);

# -- Write out the members from the largest cluster
my @sorted_clusters = map { $_->[1] }
                      sort { $b->[0] <=> $a->[0] }
                      map { [ scalar(@$_), $_ ] }
                      @$clusters;

my $fh_lrg = OpenNewFile( $lrgfile );
foreach my $a ( $sorted_clusters[0] ){
  foreach my $b ( @$a ){
    print $fh_lrg $b->filename, "\n";
  }
}
close($fh_lrg);

exit 0;

sub GetTargetLength {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ( $ali, $tgtcode ) = @_;

   # -- Get the Index of target in the alignment
   my $tgtidx = IndexInList($ali->code, $tgtcode);
   die "${subname}__E> Target code not found in alignment: $tgtcode\n"
     unless ( $ali->code->[$tgtidx] eq $tgtcode );

   # -- Remove gapped positions
   my $seq = RemoveDashes($ali->sequence->[$tgtidx]);

   # -- Get sequence length
   my $seqlen = length($seq);

   # -- Return length of sequence
   return $seqlen;
}

# --- Usage
sub usage {
print <<EOF;
${0}:

This script will take a file with the locations of several alignments and cluster
them using a greedy algorithm, to return a set of representative alignments. At
least one sequence should be common to all alignments. The alignments will be
sorted by length of this reference sequence. So the representatives will automatically
be the longest member of the cluster.

      --target_code             Code of the sequence common to all alignments
                                Mandatory option.

      --alignments_list         File containing the list of alignments that have
                                to get clustered. Should contain full paths.
                                Mandatory option.

      --alignments_format       Format of the alignments going to be clustered.
                                Currently handles only PIR.
                                Default: PIR

      --output_filename         Name of the output file that will contain
                                the representative alignments.
                                Default: <basename-of-anchor>.sel

      --groups_filename         Name of the output file that will contain
                                the clusters.
                                Default: <basename-of-anchor>.grp

      --largest_filename        Name of the output file that will contain
                                the largest cluster.
                                Default: <basename-of-anchor>.lrg

      --min_overlap             Minimum number of alignment positions that should
                                be common to two alignments to be clustered together.
                                Default: 0

      --min_pc_overlap          Same as above but expressed as a percentage of the 
                                length of the alignment.
                                Default: 90

      --max_nonoverlap          Maximum number of alignment positions that are *not*
                                common to both alignments being compared.
                                Default: 10
                        
      --max_pc_nonoverlap       Same as above but expressed as a percentage of the 
                                length of the alignment
                                Default: 10

       --min_identity           Minimum number of identical aligned positions between
                                the two alignments being compared.
                                Default: 0

       --min_pc_identity        Same as above but expressed as a percentage of the 
                                number of equivalent aligned positions.
                                Default: 90

      --version                 Report version number of this program.

      --help                    This help.

EOF
}

