#!/usr/bin/perl
# This file is part of ModPipe, Copyright 1997-2014 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

use Getopt::Long;
use File::Basename;
use File::Path;
use Cwd;
use strict;

# --- Load local modules
use PLLib::Utils;
use PLLib::Sequence;
use PLLib::Modeller;
use PLLib::TSVModUtils;

use MPLib::Version;
use MPLib::Binaries;
use MPLib::MPInit;
use MPLib::MPUtils;
use MPLib::MPModules;
use MPLib::MPSelectHits;
use MPLib::Serialize;

# --- Name reported for this process; it is also printed as the heading
#     of the usage text
$0 = "modpipe build";

# --- Command line parsing
#     Option names are matched case-sensitively
$Getopt::Long::ignorecase = 0;

# -- Mandatory inputs
my $seqid;
my $conffile;

# -- Flow control
my $exitstage;
my $cleanup;
my $help;

# -- Template search
my @hitsmode;
my $evaluehits;
my $scrstat;

# -- Benchmarking against a known structure
my $natpdb;
my $natchn;

# -- Template-of-interest handling ($template_fast is derived from
#    --template_option later on, it has no option of its own)
my $template;
my $template_option;
my $template_fast;

# -- Model scoring and gathering
my $tsvmod_flag;
my @gather_options;

GetOptions(
    'conf_file=s'            => \$conffile,
    'sequence_id=s'          => \$seqid,
    'final_models_by=s'      => \@gather_options,
    'score_by_tsvmod=s'      => \$tsvmod_flag,
    'native_pdb=s'           => \$natpdb,
    'native_chn:s'           => \$natchn,
    'exit_stage=s'           => \$exitstage,
    'hits_mode=s'            => \@hitsmode,
    'evalue_hits=f'          => \$evaluehits,
    'set_score_statistics=s' => \$scrstat,
    'clean_up=s'             => \$cleanup,
    'template=s'             => \$template,
    'template_option=s'      => \$template_option,
    'help'                   => \$help,
    'version'                => sub { VersionMessage() },
);

# --- Act on options that stop the run or rewrite other options
#     --help prints the usage text and ends with exit status 0
if ( $help ) {
    usage();
    exit 0;
}

# --- The TSVMod flag is compared in upper case later on
$tsvmod_flag = uc($tsvmod_flag);

# --- TEMPLATE_FAST is treated as TEMPLATE with the fast flag raised and
#     the score statistics switched off
$template_fast = ( $template_option eq "TEMPLATE_FAST" ) ? "ON" : "OFF";
if ( $template_fast eq "ON" ) {
    $template_option = "TEMPLATE";
    $scrstat         = "OFF";
}

# --- Get Program name (used as prefix of every diagnostic message)
my $subrname = GetSubrName();

# --- Check for configuration file
die "${subrname}__E> Cannot proceed without configuration file\n"
   unless ( $conffile && -e $conffile );

# --- Check for sequence md5 hash
die "${subrname}__E> Cannot proceed without sequence id\n"
   unless ( $seqid );

# --- Read in the configuration file
die "${subrname}__E> Failed initializing ModPipe\n"
   unless ( ModPipeInit( $conffile ) );

# --- Check if sequence exists
my $seqdir = SeqDirMP( $seqid );
my $seqnam = SeqFileMP( $seqid );
die "${subrname}__E> Sequence file not found: $seqnam\n"
   unless ( -e $seqnam );

# --- Create the temporary directory for local stuff
my $tmpdir = "${init::tmpdir}/${seqid}";
MakeDirSys($tmpdir);

@hitsmode = CleanHitsModes( @hitsmode );

# --- Set default values
$exitstage  = 'NONE' unless ( $exitstage );
$cleanup    = 'ON'   unless ( $cleanup );
$evaluehits = 1.0    unless ( $evaluehits );

# -- Anything that does not contain OFF (in any case) counts as ON
$scrstat   = 'ON' unless ( $scrstat =~ /OFF/i );

# -- A missing, empty or '_' chain id means "blank chain". The test is on
#    definedness, not truth, so that the chain id '0' is kept as given.
$natchn   = ' ' if ( ! defined($natchn) || $natchn eq '' || $natchn eq '_' );
$natpdb     = undef unless ( $natpdb );
warn "${subrname}__M> Running in benchmarking mode. Target PDB: ${natpdb}\n"
   if ( defined($natpdb) );

# --- Store the current directory
my $currdir = cwd();

# --- Move into temporary directory
#     All work files below are opened with relative names, so continuing
#     after a failed chdir would scatter them in the caller's directory.
chdir( $tmpdir )
   or die "${subrname}__E> Cannot change into temporary directory ${tmpdir}: $!\n";

# -----------------------
# MODPIPE actions begin
# -----------------------


# --- Exit stage PROFILE: calculate both kinds of profile, then stop
# todo: check which profiles were requested.
if ( $exitstage =~ /^PROFILE$/i ) {

   # -- Profile requested through GetProfile
   my %profile_args = ( 'seqid' => $seqid, 'evalue_hits' => $evaluehits );
   if ( ! GetProfile( \%profile_args ) ) {
      warn "${subrname}__E> Failed to calculate BUILD_PROFILE profile\n";
      warn "${subrname}__E>    Sequence: $seqid\n";
      exit 1;
   }

   # -- PSI-Blast profile
   if ( ! GetPsiBlastProfile( $seqid, $evaluehits, "default" ) ) {
      warn "${subrname}__E> Failed to calculate PSI-Blast profile\n";
      warn "${subrname}__E>    Sequence: $seqid\n";
      exit 1;
   }

   # -- Nothing else to do at this stage: go back, tidy up and leave
   warn "${subrname}__M> Chosen exit stage: $exitstage\n";
   warn "${subrname}__M>    ... Will stop here\n";
   chdir($currdir);
   if ( $cleanup =~ /\bON\b/i ) {
      CleanupTMP($tmpdir);
   }
   exit 0;
}

# --- Get hits against template database

# --- Open the hit file
my $hitfile = "${seqid}.hit";
my $fh_hit  = OpenNewFile( $hitfile );


# --- Seq-Seq hits
# -- Always get Seq-Seq hits. A false return is fatal here, so $outhits
#    holds a true value from this point on.
my $outhits = GetHits100( $seqid, $natpdb, $natchn, [], $evaluehits, $template_fast )
      or die "${subrname}__E> Failed to get Seq-Seq hits: $seqid\n";

#  Now get the other requested hitsmodes

foreach my $hitsmode (@hitsmode) {

   my $newhits = GetHits($seqid, $hitsmode, $natpdb, $natchn, $evaluehits, $template_fast, $scrstat);

   # Combine the new hits with the hits picked so far. If the hits picked
   # so far are not an array reference, the new hits replace them; new
   # hits that are not an array reference are ignored.
   if ( ref($newhits) eq 'ARRAY' ) {
      $outhits = ( ref($outhits) eq 'ARRAY' )
               ? [ @$outhits, @$newhits ]
               : $newhits;
   }

   # Top filtering is not reliable for the HH hitsmodes. Since the HH hitsmodes are processed last,
   # we skip them here.
   if (($template_option eq "TOP") && (substr($hitsmode,0,2) ne "HH")) {
      $outhits = FilterTemplates($outhits, $template);
   }
}
WriteHitsFile($outhits, $fh_hit);

close($fh_hit);

# -- Count the number of hits
my $hitcnt = scalar(@$outhits);
print "${subrname}__C> ${hitcnt} hits found for all hitsmodes \n";

# --- Quit if there are no hits to process
unless ( $hitcnt > 0 ){
   warn "${subrname}__M> No hits found against template database\n";
   warn "${subrname}__M>    ... Will stop here\n";
   chdir($currdir);
   CleanupTMP($tmpdir) if ( $cleanup =~ /\bON\b/i );
   exit 0;
}

# --- Template-of-interest run: the input template has to be among the
#     hits, otherwise there is nothing to model and the run ends here
#     (with exit status 0).
my $template_hit;
if ( $template && $template ne "NONE" ) {
   $template_hit = FindTemplate( $outhits, $template, $template_option );

   if ( ! $template_hit ) {
      warn "${subrname}__M> Selected Template $template not in hits\n";
      warn "${subrname}__M>    ... Will stop here\n";

      # -- Keep the hit file in seqdir, under a separate name, for
      #    diagnostic purposes
      CopyFile( $hitfile, "${seqdir}/${hitfile}.no_template" )
         or die "${subrname}__E> Failed copying hit file to Modpipe file structure: ${seqdir}/${hitfile}.no_template\n";

      chdir($currdir);
      if ( $cleanup =~ /\bON\b/i ) {
         CleanupTMP($tmpdir);
      }
      exit 0;
   }
}

# --- Store the hit file in seqdir
CopyFile( $hitfile, "${seqdir}/${hitfile}" )
   or die "${subrname}__E> Failed copying hit file to repository\n";

# --- Select the hits (alignments) that will be modeled
#
#     With clustering switched ON in the configuration and more than one
#     hit available, the hits are reduced to cluster representatives;
#     otherwise all hits are taken over. The result goes to the sel file.

# --- Specify filenames
my $selfile = "${seqid}.sel";
my $fh_sel  = OpenNewFile( $selfile );

{
   # -- True when a template of interest was given on the command line
   my $toi_requested = ( $template && $template ne "NONE" ) ? 1 : 0;
   my $selected;

   if ( $init::clusterali =~ /\bON\b/i && $hitcnt > 1 ) {
      # -- Cluster alignments
      my ($representatives) = SelectHits( $seqid, $outhits )
         or die "${subrname}__E> Failed hit selection: $seqid\n";

      # -- The template of interest has to survive the clustering: put
      #    its hit back if it is not among the representatives
      if ( $toi_requested
           && ! FindTemplate( $representatives, $template, $template_option ) ) {
         push @$representatives, $template_hit;
         warn "${subrname}__M> Adding input template to clustered alignments\n";
      }
      $selected = $representatives;
   }
   else {
      warn "${subrname}__M> Alignments will not be clustered\n";
      warn "${subrname}__M>  - Option set to OFF or found single alignment\n";
      $selected = $outhits;
   }

   # -- With option TEMPLATE only hits for the input template are kept
   if ( $toi_requested && $template_option eq "TEMPLATE" ) {
      $selected = FilterTemplates( $selected, $template );
   }

   # -- Write the sel file
   WriteHitsFile( $selected, $fh_sel );
}

# -- Close the sel file
close( $fh_sel );

# --- Store the sel file in seqdir
CopyFile( $selfile, "${seqdir}/${selfile}" )
   or die "${subrname}__E> Failed copying sel file to repository\n";

# --- Exit stage ALIGNMENTS: stop before any model is built
if ( $exitstage =~ /^ALIGNMENTS$/i ) {
   warn "${subrname}__M> Chosen exit stage: $exitstage\n";
   warn "${subrname}__M>    ... Will stop here\n";
   chdir($currdir);
   if ( $cleanup =~ /\bON\b/i ) {
      CleanupTMP($tmpdir);
   }
   exit 0;
}

# --- Build models for all hits in the hitfile

# --- Open the mod file
my $modfile = "${seqid}.mod";
my $fh_mod  = OpenNewFile( $modfile );

# --- Build models
GetModels( $seqid, $fh_mod )
   or die "${subrname}__E> Failed to build models: $seqid\n";

# --- Close the mod file
close($fh_mod);

# --- Copy the modfile to seqdir
die "${subrname}__E> Failed copying mod file to repository\n"
   unless( CopyFile($modfile, "${seqdir}/${modfile}") );

# --- Rate models using TSVMod
#     Uses external TSVMod program to rate
#     Returns Method (MatchbySS, full or reduced(SF, SR), MatchbyTemplate (T)
#     or "not enough similar structures" (NA), and predicted RMSD and NO35
#
#     $newmodfile is declared once and reused for the rating step below.
#     NOTE(review): a false return only triggers a warning, but the false
#     value is then handed to CopyFile, whose failure is fatal - confirm
#     whether a scoring failure is meant to be survivable.
my $newmodfile = ScoreByTSVMod (${init::tsvmodexe},$seqid,$tsvmod_flag)
   or warn "${subrname}__W> Failed scoring models by TSVMod\n";

# --- Copy the new modfile to seqdir
die "${subrname}__E> Failed copying mod file to repository\n"
   unless( CopyFile($newmodfile, ModFileMP($seqid)) );

# --- Rate models based on sequence/structure quality measures
#     This takes the model yaml file and introduces two new
#     entries - one with individual ratings and one with a
#     combined rating (mpqs). The latter, for now, is simply the
#     sum of all individual scores.
$newmodfile = RateModels( $seqid,$tsvmod_flag )
   or warn "${subrname}__W> Failed rating models\n";

# --- Copy the new modfile to seqdir
die "${subrname}__E> Failed copying mod file to repository\n"
   unless( CopyFile($newmodfile, ModFileMP($seqid)) );

# --- Move back to original directory
chdir($currdir);

# --- Optional local model gathering through GatherModMP.py
#     Runs only when --final_models_by was given at least once.
#     ($score and $option stay declared at file scope, as before.)
my ($gather_options,$score,$option);
if ( @gather_options ) {

    # -- Fixed part of the argument string
    $gather_options = join( ' ', " -c $conffile",
                                 "--seq_id ${seqid}",
                                 "--local_only True" );

    # -- Criteria that are passed on; TSVMOD only when TSVMod scoring
    #    was not switched off
    my @allscores = qw( MPQS ALL DOPE INPUT_TEMPLATE LONGEST_GA341
                        SEQID GA341 LONGEST_DOPE );
    push @allscores, "TSVMOD" unless ( $tsvmod_flag eq "OFF" );

    # -- Each --final_models_by value may itself be a comma-separated list
    @gather_options = map { split( /\,/, $_ ) } @gather_options;

    # -- Known criteria are appended in the order of @allscores, once for
    #    every time they were requested; anything else is dropped
    foreach my $criterion (@allscores) {
        my $times_requested = grep { $_ eq $criterion } @gather_options;
        foreach ( 1 .. $times_requested ) {
            if ( $criterion eq "INPUT_TEMPLATE"
                 && $template && $template ne "NONE" ) {
                $gather_options .= " --template ${template}";
            }
            $gather_options .= " --final_models_by ${criterion}";
        }
    }

    # -- Run the gathering script, stderr merged into stdout
    my $command = GetModPipeScript("main/GatherModMP.py")
                    . " $gather_options 2>&1";
    if ( system( $command ) ) {
        warn "${subrname}__E> Failed to collect models from repository\n";
        warn "${subrname}__E> $command\n";
        die  "${subrname}__E> ... Will exit\n";
    }
}

# --- Remove the temporary directory unless clean up was switched off
if ( $cleanup =~ /\bON\b/i ) {
    CleanupTMP($tmpdir);
}

# --- Exit finally
exit 0;


# --- Reduce a list of hits according to the file-level $template_option
#
#     Arguments:
#       $hits      reference to an array of hit objects
#       $template  template id: PDB code directly followed by the chain
#
#     Behaviour per $template_option:
#       TOP       the hits are grouped by ClusterHitsByRegion and, for
#                 every region, the hits chosen by PickTopHits are kept
#       TEMPLATE  only hits having a template whose code and chain,
#                 joined together, equal $template are kept
#       other     the hits are returned unchanged
#
#     Returns a reference to an array of hits.
sub FilterTemplates {

   my ($hits, $template) = @_;

   if ($template_option eq "TOP") {
      # select only the top templates to reduce processing time
      # variables are in $hit->sequence (length of sequence),
      # $hit->highest_sequence_identity and
      # @$hit->region (beginning and ending residue of region)
      # use modified routines from the old perl GatherMP script
      my @top_hits;
      my $regions = ClusterHitsByRegion($hits);
      foreach my $region (@$regions) {
         my $selfin = PickTopHits($region);
         push @top_hits, @$selfin if (defined($selfin));
      }
      return \@top_hits;
   }

   if ($template_option eq "TEMPLATE") {
      my @filtered_hits;
      foreach my $hit (@$hits) {
         # keys are code, chain, region and sequence_identity
         # (a hit is added once for every template entry that matches)
         foreach my $template_hit (@{$hit->templates}) {
            if (($template_hit->code . $template_hit->chain) eq $template) {
               push @filtered_hits, $hit;
            }
         }
      }
      return \@filtered_hits;
   }

   # Any other option means no filtering. Returning the input keeps the
   # result an array reference instead of the false value of the last
   # comparison.
   return $hits;
}


# --- Run the template search for one hits mode and return its hits
#
#     Arguments (positional):
#       $seqid          sequence id, handed on to the search routines
#       $hitsmode       name of the hits mode to run
#       $natpdb         native PDB code, handed on
#       $natchn         native chain id, handed on
#       $evaluehits     E-value threshold, handed on
#       $template_fast  fast-template flag, handed on to GetHits010/020
#       $scrstat        score statistics flag, handed on to GetHitsPP
#
#     Returns what the search routine returned, or a reference to an empty
#     array (after a warning) when $hitsmode is not one of the modes
#     handled here.
#     Exits with status 1 when a profile that the mode needs cannot be
#     calculated; dies when the search routine returns a false value.
sub GetHits {

  my ( $seqid, $hitsmode, $natpdb, $natchn, $evaluehits, $template_fast, $scrstat ) = @_;
  my $outhits = [];

  # todo: add another check that creates the profile first - to avoid that OFF is always executed

  if ( $hitsmode eq "Prf-Seq") {
     # --- Prf-Seq hits
     _RequireBuildProfile( $seqid, $evaluehits );
     $outhits = GetHits010( $seqid, $natpdb, $natchn, $outhits, $evaluehits, $template_fast )
        or die "${subrname}__E> Failed to get Prf-Seq hits: $seqid\n";

  } elsif ( $hitsmode eq "PSI-Blast-Prf-Seq" ) {
     # --- Prf-Seq hits with PSI-Blast generated profiles
     _RequirePsiBlastProfile( $seqid, $evaluehits );
     $outhits = GetHits020( $seqid, $natpdb, $natchn, $outhits, $evaluehits, $template_fast )
        or die "${subrname}__E> Failed to get PSI-Blast-Prf-Seq hits: $seqid\n";

  } elsif ( $hitsmode eq "Prf-Prf" ) {
     # --- Prf-Prf hits - 0010
     _RequireBuildProfile( $seqid, $evaluehits );
     $outhits = GetHitsPP( $seqid, $natpdb, $natchn, $outhits, $evaluehits, $scrstat,
        'BP', 'CCMAT', '0010', -100, -700, -70 )
        or die "${subrname}__E> Failed to get Prf-Prf hits (0010): $seqid\n";

  } elsif ( $hitsmode eq "PSI-Blast-Prf-Prf" || $hitsmode eq "PSI-Blast_Prf_Prf" ) {
     # --- Prf-Prf hits with PSI-Blast generated profiles - 0020
     #     Both spellings are accepted: the help text documents the one
     #     with dashes, earlier versions of this routine only knew the
     #     one with underscores.
     _RequirePsiBlastProfile( $seqid, $evaluehits );
     $outhits = GetHitsPP( $seqid, $natpdb, $natchn, $outhits, $evaluehits, $scrstat,
        'PB', 'CCMAT', '0020', -100, -700, -70 )
        or die "${subrname}__E> Failed to get Prf-Prf hits (0020): $seqid\n";

  } elsif ( $hitsmode eq "Seq-Prf" ) {
     # --- Seq-Prf hits - 0001
     $outhits = GetHitsPP( $seqid, $natpdb, $natchn, $outhits, $evaluehits, $scrstat,
        'SS', 'SEQPRF', '0001', -450, -500, -50 )
        or die "${subrname}__E> Failed to get Seq-Prf hits (0001): $seqid\n";

  } elsif ( $hitsmode eq "Max-PSSM-Seq-Prf" ) {
     # --- Max-PSSM-Seq-Prf hits with BuildProfile - 0002
     _RequireBuildProfile( $seqid, $evaluehits );
     $outhits = GetHitsPP( $seqid, $natpdb, $natchn, $outhits, $evaluehits, $scrstat,
        'BP', 'CONS_BY_MAXPSSM', '0002', -450, -500, -50 )
        or die "${subrname}__E> Failed to get Seq-Prf hits (0002): $seqid\n";

  } elsif ( $hitsmode eq "Max-Freq-Seq-Prf" ) {
     # --- Max-Freq-Seq-Prf hits with PSI-Blast - 0004
     #     NOTE(review): a PSI-Blast profile is calculated here, yet 'BP'
     #     is passed on where the PSI-Blast Prf-Prf mode passes 'PB' -
     #     confirm against GetHitsPP that this is intended.
     _RequirePsiBlastProfile( $seqid, $evaluehits );
     $outhits = GetHitsPP( $seqid, $natpdb, $natchn, $outhits, $evaluehits, $scrstat,
        'BP', 'CONS_BY_MAXFREQ', '0004', -450, -500, -50 )
        or die "${subrname}__E> Failed to get Seq-Prf hits (0004): $seqid\n";

  } elsif ( $hitsmode eq "HHBlitsSP" ) {
     $outhits = GetHitsHHBlits( $seqid, $outhits, $evaluehits, "SP", "OFF" )
        or die "${subrname}__E> Failed to get HHBlits Hits: $seqid\n";

  } elsif ( $hitsmode eq "HHBlitsPP" ) {
     $outhits = GetHitsHHBlits( $seqid, $outhits, $evaluehits, "PP", "OFF" )
        or die "${subrname}__E> Failed to get HHBlits Hits: $seqid\n";

  } elsif ( $hitsmode eq "HHSearchSP" ) {
     $outhits = GetHitsHHSearch( $seqid, $outhits, $evaluehits, "SP", "OFF" )
        or die "${subrname}__E> Failed to get HHSearch Hits: $seqid\n";

  } elsif ( $hitsmode eq "HHSearchPP" ) {
     $outhits = GetHitsHHSearch( $seqid, $outhits, $evaluehits, "PP", "OFF" )
        or die "${subrname}__E> Failed to get HHSearch Hits: $seqid\n";

  } else {
     # Not handled here: say so instead of silently returning no hits
     warn "${subrname}__W> Unknown hits mode '${hitsmode}': no hits collected for it\n";
  }

  return $outhits;
}

# --- Make sure the profile obtained through GetProfile is available
#     (profile update OFF); reports the failure and exits with status 1
#     otherwise.
sub _RequireBuildProfile {

  my ( $seqid, $evaluehits ) = @_;

  return 1
     if ( GetProfile( {'seqid'=>$seqid, 'evalue_hits'=>$evaluehits, 'prfupdate' => "OFF"} ) );

  warn "${subrname}__E> Failed to calculate BUILD_PROFILE profile\n";
  warn "${subrname}__E>    Sequence: $seqid\n";
  exit 1;
}

# --- Make sure the PSI-Blast profile is available (update OFF); reports
#     the failure and exits with status 1 otherwise.
sub _RequirePsiBlastProfile {

  my ( $seqid, $evaluehits ) = @_;

  return 1 if ( GetPsiBlastProfile( $seqid, $evaluehits, "OFF" ) );

  warn "${subrname}__E> Failed to calculate PSI-Blast profile\n";
  warn "${subrname}__E>    Sequence: $seqid\n";
  exit 1;
}

# --- Usage
#     Prints the help text to standard output. The heading is the current
#     value of $0. Takes no arguments; the caller decides whether to exit.
sub usage {
print <<EOF;

${0}:

      --version                 Report version number of this program.
      --help                    This help. Pipe it through 'more' if it
                                scrolls off the screen.
      --conf_file		ModPipe configuration file. Mandatory. 
      --sequence_id		Sequence Id (MD5 digest of the sequence)
                                in the ModPipe filesystem. Mandatory.
      --native_pdb              The PDB code of the target sequence. A
                                convenience feature for benchmarking with PDB sequences. 
      --native_chn              The chain identifier for native structure. (See above).
      --exit_stage		Choose the exit stage for the program. You can
				quit after one of the following: PROFILE,
				ALIGNMENTS, MODELS.  Default: NONE
      --hits_mode		Mode(s) for finding template hits. Typically
                                several modes are used. 
                                Input as comma-separated list or in several statements
                                The following modes are available:
                                 * Seq-Seq: Sequence-Sequence search (always performed).
                                 * Prf-Seq: Profile-Sequence search using MODELLER
                                         profile.build() profiles.
                                 * PSI-Blast-Prf-Seq: Profile-Sequence search using PSI-BLAST
                                         profiles.
                                 * Prf-Prf: Profile-Profile search using MODELLER
                                         profile.build() profiles.
                                 * PSI-Blast-Prf-Prf: Profile-Profile search using PSI-BLAST
                                         profiles.
                                 * Seq-Prf: Sequence-Profile search.
                                 * Max-PSSM-Seq-Prf: Sequence-Profile search with
                                             Max-PSSM scoring.
                                 * Max-Freq-Seq-Prf: Sequence-Profile search with
                                             Max-frequency scoring.
                                 * HHBlitsPP: Fast Profile-Profile search using HHBlits (Söding). 
                                 * HHBlitsSP: Fast Sequence-Profile search using HHBlits (Söding). 
                                 * HHSearchPP:  Profile-Profile search using HHSearch (Söding). 
                                 * HHSearchSP:  Sequence-Profile search using HHSearch (Söding). 
                                   HH-methods require installation of HHSuite, including databases 
                                   (see documentation)
      --evalue_hits		The E-value threshold to get hits against
				template databases. This value controls hits
				from all searches.  Default: 1.0
      --set_score_statistics	This should be switched to OFF when testing since
                                the test database does not have enough profiles.
                                Only applies to profile-profile alignments
                                (hits modes Prf-Prf and PSI-Blast-Prf-Prf).  Default: ON
      --clean_up		Flag to clean up the temporary directory
				after all operations ([ON], OFF). 
      --template                If present, creates models only if this PDB ID
                                and chain (sample: abcdA) is among hits from
                                the template database; also ensures that the
                                input template remains after clustering.
      --template_option         [ALL], TOP, TEMPLATE,TEMPLATE_FAST. If --template is given, 
                                models [ALL] hits, TOP models all hits that have no more than 20% 
                                lower sequence identity than the highest sequence identity
                                template (per region), or only the hits for the TEMPLATE.
                                A faster option is TEMPLATE_FAST, where only the profile
                                of the input template is used, thus effectively disabling 
                                meaningful evalue statistics. 
                                For the fast option, the shortened PDB95 files should be
                                specified in the configuration file. The option just
                                disables the computation of statistics. 
      --score_by_tsvmod         [ON],OFF (in case tsvmod is not installed)
      --final_models_by		If present, performs model-gathering (.fin file) in
                                sequence directory. (MPQS,ALL,DOPE,INPUT_TEMPLATE,LONGEST_GA341,
                                SEQID,LONGEST_DOPE,GA341,TSVMOD)       
EOF
}
