# This file is part of ModPipe, Copyright 1997-2020 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

package MPLib::MPSelectHits;
require Exporter;
@ISA    = qw(Exporter);
@EXPORT = qw( SelectHits ClusterHitsByRegion CompareRegions RemoveDuplicates SortModelsByLength PickTopHits );


use strict;
use Cwd;
use File::Basename;

use PLLib::Utils;
use PLLib::Sequence;
use PLLib::Alignment;
use MPLib::MPInit;
use MPLib::MPUtils;


sub SelectHits {

   use ExtLib::Tie::IxHash;

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   die "${subname}__D> Insufficient arguments\n"
      unless ( scalar(@_) == $nargs );

   # --- Reassign input arguments
   my ($seqid, $hits) = @_;

   # --- Get the current directory
   my $currdir = cwd();

   # -- Sort the models by length (descending)
   my $hitl = SortMPDataByLength( $hits );

   # -- Copy the alignment files locally
   my %alifiles = (); 
   tie %alifiles, "Tie::IxHash";
   foreach my $hit ( @$hitl ){

      # -- Get the alignment id
      my $aliid = $hit->alignment->id;

      # -- Copy the alignment file over
      my $aliloc = undef;
      unless( $aliloc = CopyAlignmentMP($seqid, $aliid) ){
         warn "${subname}__E> Failed copying alignment: $aliid\n";
         next;
      }

      # -- Append array of filenames
      $alifiles{$aliid} = $aliloc;
   }

   # -- Report numnber of alignments
   my $alicnt = keys %alifiles;
   warn "${subname}__M> Alignments found for clustering: $alicnt\n";

   # -- Stop if there are no alignments
   return if ( $alicnt < 1 );

   # -- Return a single cluster if only one alignment
   if ( $alicnt == 1 ) {
      my @representatives = my @clustermembers = ();
      push @representatives, $hitl->[0];
      push @{ $clustermembers[$#representatives] }, $hitl->[0];
      return ( \@representatives, \@clustermembers );
   }

   # -- Create ali objects from all alignment files
   my @aliobjects = ();
   foreach my $ali ( values %alifiles ){
      my $aliobj = undef;
      $aliobj = ReadAlignment($aliobj, $ali, 'PIR', 'VERTICAL');
      push @aliobjects, $aliobj;
   }

   # -- Cluster Alignments
   my ( $repstack, $clusters ) = ClusterAlignments( \@aliobjects, $seqid,
                                                    $init::aliclust_ovlp, 
                                                    $init::aliclust_pcovlp,
                                                    $init::aliclust_nonovlp, 
                                                    $init::aliclust_pcnonovlp, 
                                                    $init::aliclust_idcol, 
                                                    $init::aliclust_pcidcol);

   # -- Check clustering results
   unless ( scalar(@$repstack) == scalar(@$clusters) &&
            scalar(@$repstack) > 0 ){
      warn "${subname}__E> Clustering failed\n";
      return;
   }

   warn "${subname}__M> Clustering produced ", scalar(@$repstack), " representatives\n";

   # -- Write out the results into files
   my %reverse_alifiles = reverse %alifiles;

   my @representatives = my @clustermembers = ();
   foreach my $i ( 0 .. $#$repstack ){
      # -- Get the representative alignment's id
      my $ali = $repstack->[$i];
      my $alifile = $ali->filename;
      my $rep_aliid = $reverse_alifiles{$alifile};
      my @hitlines = grep { $_->alignment->id eq $rep_aliid } @$hitl;
      push @representatives, $hitlines[0];

      # -- Fetch the members of the cluster
      foreach my $aliobj ( @{ $clusters->[$i] } ){
         my $membfile = $aliobj->filename;
         my $memb_aliid = $reverse_alifiles{$membfile};
         my @hitlines = grep { $_->alignment->id eq $memb_aliid } @$hitl;
         push @{ $clustermembers[$#representatives] }, $hitlines[0];
      }
   }

   # -- Return relevant arrays of hit lines
   return ( \@representatives, \@clustermembers );
}

sub ClusterHitsByRegion {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # -- Reassing input arguments
   my $datl = $_[0];

   # -- Sort the models by length (descending)
   $datl = SortModelsByLength( $datl );

   # -- Return null reference if empty
   return [] unless ( @$datl );

   # -- Cluster models by region
   my @org_stack = my @rep_stack = my @clusters = ();
   my @org_stack = @$datl;

   while ( @org_stack > 0 ){

      # -- Pop the first sequence in the original stack
      #    as a representative
      push(@rep_stack, $org_stack[0]);

      # -- Initialize the auxillary stack
      my @aux_stack = ();

      # -- Compare all elements of the original stack
      #    against the current representative, store
      #    the ones that do not cluster in the auxillary
      #    stack
      foreach my $org ( @org_stack ){

         # -- Compare the elements
         my $compare = CompareRegions($rep_stack[$#rep_stack], $org);

         # -- Append appropriate arrays
         if ( $compare ){
            push @{ $clusters[$#rep_stack] }, $org;
         } else {
              push(@aux_stack, $org);
           }
      }

      # -- Now copy the auxilary stack back to the
      #    original for the next round
      @org_stack = @aux_stack;
   }

   # -- Return the cluster array
   return \@clusters;
}

sub CompareRegions {

   # Determines the overlap between hits. 
   # calles GetOverlap twice - 
   # first to determine the overlap 
   # between the longer and the shorter sequence. 
   # second to determine the overlap between the shorter and the longer sequence

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # -- Reassign input arguments
   my ($repl, $cmpl) = @_;

   # -- Get the range of models
   
   my ($ref_beg, $ref_end) = @{$repl->region};
   my ($cmp_beg, $cmp_end) = @{$cmpl->region};
   my ($ref_seq_ident) = $repl->highest_sequence_identity;
   my ($cmp_seq_ident) = $cmpl->highest_sequence_identity;

   # -- Get the overlap numbers
   my ($overlap, $percent_overlap, $non_overlap, $percent_non_overlap, $overlap_beg, $overlap_end) =
      GetOverlap($ref_beg, $ref_end, $cmp_beg, $cmp_end);

   # -- Determine if this should be a separate cluster
   my $compare = ( $non_overlap > 30 || $percent_non_overlap > 30) ? 0 : 1;

   # -- If member falls into an existing cluster, check whether it is significiantly shorter. 
   # -- If yes - start its own cluster
   # -- Todo: check first whether the threshold difference (highest_sequence_identity) suggests that
   # -- this region should be included in the first cluster

   if ($compare == 1) {
      ($overlap, $percent_overlap, $non_overlap, $percent_non_overlap, $overlap_beg, $overlap_end) =
      GetOverlap($cmp_beg, $cmp_end, $ref_beg, $ref_end);
      $compare = ( $non_overlap > 30 || $percent_non_overlap > 30) ? 0 : 1;
   }
   print "${subname}__M> Compared regions ${ref_beg}-${ref_end}\t ${cmp_beg}-${cmp_end}\t${compare}\t${non_overlap}\t${percent_non_overlap}\n";

   # -- Return status
   return $compare;
}

sub RemoveDuplicates {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # -- Reassing input arguments
   my $datl = $_[0];
   return [] unless (@$datl);

   # -- Remove duplicates
   my @org_stack = @$datl; my @rep_stack = ();
   while ( @org_stack > 0 ){
      push(@rep_stack, $org_stack[0]);
      my @aux_stack = ();
      foreach my $org ( @org_stack ){
         next if ( $org eq $rep_stack[$#rep_stack] );
         push @aux_stack, $org;
      }
      @org_stack = @aux_stack;
   }

   # -- Return the rep.stack
   return \@rep_stack;
}


sub SortModelsByLength {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # -- Reassing input arguments
   my $datl = $_[0];
   my @sorted_datl = ();

   # -- Sort models by length
   if (ref($datl) eq 'ARRAY') {
      @sorted_datl = map { $_->[1] }
                        sort { $b->[0] <=> $a->[0] }
                        map { [ ((split(/\|/, $_))[3] - (split(/\|/, $_))[2] + 1), $_ ] }
                        @$datl;
   } 
   return \@sorted_datl;


   # -- Return sorted array
}

sub PickTopHits {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ( $dataline ) = shift @_;

   # -- Return if there are no models to select or there is only one
   if ( scalar(@$dataline) < 1 ) {
      return [];
   } elsif ( scalar(@$dataline) == 1 ) {
     my $line = @$dataline[0];
     print "${subname}__M> Region with one hit returned in Region : ".
           join("-",@{$line->region}). " with " .
           $line->highest_sequence_identity .
           "% sequence identity\n"; 
     return $dataline;
   }
   my $highest = 0;
   my $second;
   my $tophit;
   my @outlines;
   foreach my $line (@$dataline) {
      if ($highest < $line->highest_sequence_identity)  {
         $tophit = $line;
         $second = $highest; 
         $highest =  $line->highest_sequence_identity;
      }
   }
      
   # -- Assemble Return
   # -- Threshold: min 30% sequence identity and all entries 20% lower. 
   # -- All entries are returned 
   my $line = @$dataline[0];
   if (!$tophit) {
     print "${subname}__W> No hits above 0% seqid found for Region : ".
           join("-",@{$line->region}) . "\n";
      return;
   }
   # for debugging purpose:
   if ($tophit && !$tophit->region) {
      @$tophit->region = [0,0];
   }
   print "${subname}__M> New Region started with $highest % highest sequence identity ( first entry: ".join("-",@{$line->region}).")\n";
   print "${subname}__M> First entry: ".join("-",@{$line->region})."\n";
   print "${subname}__M>  Top entry: " .join("-",@{$tophit->region})."\n";
   print "${subname}__M> ".scalar(@$dataline)." entries in region\n";
   if ($highest >= 30) {
      foreach my $line (@$dataline) {
         if ($line->highest_sequence_identity >= ($highest-20)) {
             print "${subname}__M> Top Alignment identified with ".
                $line->highest_sequence_identity."% Sequence identity in Region ".
                join("-",@{$line->region})."\n";
             push @outlines,$line;
         } else {
             print "${subname}__M>  Alignment rejected with ".
                $line->highest_sequence_identity."% Sequence identity in Region ".
                join("-",@{$line->region})."\n";
         }
       }
       return \@outlines;
   } else {
      print "${subname}__M> No Top Alignment identified in region, all alignments selected.\n";
      print "${subname}__M> Returning $dataline\n";
      return $dataline;
   }
}

