#!/usr/bin/perl
# This file is part of ModPipe, Copyright 1997-2010 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

use Getopt::Long;
use IO::File;
use File::Basename;
use strict;

# --- Load local modules
use MPLib::Version;
use MPLib::Binaries;
use PLLib::Utils;
use PLLib::Sequence;
use PLLib::Modeller;
use PLLib::ModProfile;
use PLLib::ModPy;
use PLLib::Alignment;
use PLLib::PDBUtils; 

my $default_pdb_repository = GetPDBRepository(0);

{
# --- Process command line options

$Getopt::Long::ignorecase = 0;

my ( $alnfile, $nummod, $begmod, $selmodby,
     $runname, $tgtcode, $moddir, $outfile,
     $modbase, $pdbrep, $cleanup, $help,
     $hetatm, $waters, $returntyp ) = undef;

GetOptions (
           "alignment_file=s"		=>      \$alnfile,
           "model_index_start=i"	=>	\$begmod,
           "number_of_models=i"		=>      \$nummod,
           "runname=s"			=>      \$runname,
           "target_code=s"		=>	\$tgtcode,
           "output_moddir=s"		=>	\$moddir,
           "outfile_name=s"		=>	\$outfile,
           "model_basename=s"		=>	\$modbase,
           "pdb_repository=s"		=>	\$pdbrep,
           "include_hetatm=s"		=>	\$hetatm,
           "include_waters=s"		=>	\$waters,
           "select_model_by=s"		=>	\$selmodby,
           "return:s"   		=>	\$returntyp,
           "clean_up=s"			=>	\$cleanup,
           "help"			=>      \$help,
           "version"                    => sub { VersionMessage() },
           );

# --- Output help
if ( $help ) {
   &usage;
   exit 0;
}

# --- Get program name
my $subrname = GetSubrName();

# --- Check command-line options
unless ( $alnfile && $tgtcode ){
   warn "${subrname}__E> Specify the input alignment file and target code\n";
   die "Try $subrname --help for usage information\n";
}

# --- Check if the alignment file list exists
unless ( -e $alnfile ){
   die "${subrname}__E> Could not find the alignment file: $alnfile\n";
}

# --- Set various default parameters
$begmod    = 1 unless ( $begmod );
$nummod    = 1 unless ( $nummod );
$runname   = "${tgtcode}-models" unless ( $runname );
$moddir    = "${tgtcode}-models" unless ( $moddir );
$outfile   = "${tgtcode}-models.out" unless ( $outfile );
$modbase   = "${tgtcode}-models" unless ( $modbase );
$pdbrep    = $default_pdb_repository unless ($pdbrep);
$cleanup   = 'ON' unless ( $cleanup );
$selmodby  = 'MOLPDF' unless ( $selmodby );
$returntyp = 'BEST' unless ( $returntyp );
$hetatm    = ( $hetatm =~ /\bON\b/i ) ? 'True' : 'False';
$waters    = ( $waters =~ /\bON\b/i ) ? 'True' : 'False';                        

# --- Initialize the cleanup array
my @cleanup_files = ();

# --- Check if the target_code exists in the alignment
die "${subrname}__E> Align code not in alignment: $tgtcode\n"
   unless ( CheckAlignCode($alnfile, 'PIR', $tgtcode) );

# --- Get the list of knowns 
my @knowns = ();
die "${subrname}__E> No structure entries in alignment: $alnfile\n"
   unless ( @knowns = StructureCodes( $alnfile ) );

# --- Get the template count
my $templatecnt = scalar( @knowns );

# --- Parse the alignment to get sequence identities
#     This assumes that the first sequence is the target
my @seqids = ();
die "${subrname}__E> Failed getting the seq. identities: $alnfile\n"
   unless ( @seqids = SeqIdFromAli($alnfile, 'PIR', 'ON'));

# --- Sort the seqids to get the highest (for model evaluation)
# --- First shift the array to remove the target line
shift @seqids;

# ---- Sort the remaining to get the highest
my @seqids_sorted = sort { $b <=> $a } @seqids;
my $hghseqid = shift @seqids_sorted;

# --- Report 
print "${subrname}__M> Alignment File  : $alnfile\n";
print "                No. of templates: $templatecnt\n";
print "                Highest Seqid   : $hghseqid\n";

# --- Create the PYTHON file names
my ($scrnam, $lognam, $fhpy) = OpenModIOFiles($runname);

# --- Decide the model index numbers
my $endmod = $begmod + $nummod - 1;

# -- Patch alignment and template pdb files for modeling with
#    ligands. As a first approximation, this simply handles the
#    ligand from the first template that has one. Ideally, this 
#    should be able to generate multiple alignments for each 
#    ligand. The overall strategy is this: (i) Choose the first
#    template that has a bound ligand that is within 6A of the
#    region that is aligned. This involves creating modified pdb
#    files in a new directory; (ii) patch the alignment file to
#    include as many dots as required for the target and the template
#    with ligand and gaps for the rest. Also, patch the residue
#    range in the PIR file to account for new hetatoms. Use Ben's
#    +x style for the residue range.
my $hetcnt = my $watcnt = 0;
if ( $hetatm eq 'True' || $waters eq 'True' ){

   # -- Read in the alignment into the ali data structure
   my $ali = undef;
   $ali = ReadAlignment($ali, $alnfile, 'PIR', 'HORIZONTAL');

   my $newrep = MakeLocalTMPDir();
   ($pdbrep, $hetcnt, $watcnt, $ali) = PrepAliForLigands($ali, $alnfile, 
                               $tgtcode, $pdbrep, $newrep, $hetatm, 6, 
                               $waters, 4, \@cleanup_files);

   # -- Update (write out) the alignment in the same file
   my $outali = WriteAlignment($ali);
   my $fhaln = OpenNewFile($alnfile);
   print $fhaln $outali;
   close ($fhaln);

   # -- Now identify all the inserted non-std residues in the modeled
   #    sequence and add special restraints to maintain the continuity
   #    of the protein chain. This is only for residues within the chain,
   #    like, carboxylated lysine, and not for things like metal atoms etc.
   
   # -- Find the index of the target sequence
   my $itgt = IndexInList($ali->code, $tgtcode);
   
   # -- Now scan the target sequence for positions of non-std residues
   #    This is approximate since it will quit if it encounters a /.
   my @hetpos = (); my $rescnt = 0;
   foreach my $p ( 0 .. length($ali->sequence->[$itgt]) - 1){
      my $res = substr($ali->sequence->[$itgt], $p, 1);
      $rescnt++ if ( ! IsGAP($res) );
      last if ( $res =~ /\// );
      next if ( IsGAP($res) || IsSTDAA($res) );
      push @hetpos, $rescnt ;
   }

   # -- Create special restraints for each additional residue
   #    and write the modeling script
   WRTPY_MODEL_WITH_SPBBRESTR($fhpy, $alnfile, \@knowns, $tgtcode, 
                              $pdbrep, $begmod, $endmod, 'ALL',
                              $hghseqid, $hetatm, $waters, \@hetpos);


}
  else {

    # --- Create the TOP file for modeling
    WRTPY_MODEL($fhpy, $alnfile, \@knowns, $tgtcode, 
                $pdbrep, $begmod, $endmod, 'ALL',
                $hghseqid, $hetatm);
  }

# --- Close the TOP file
close ( $fhpy );

# --- Update garbage list
push @cleanup_files, $scrnam, $lognam;

# --- Run MODELLER for BUILD_PROFILE
RunModeller($scrnam)
   or die "${subrname}__E> MODELLER failed on alignment: $alnfile\n";

# --- Check for modeller output
my @modelnames = CountModels( $tgtcode, $begmod, $endmod );
my $modelcnt   = scalar(@modelnames);

# --- Report count
print "${subrname}__M> No. of models built for alignment: $modelcnt\n";

# --- Get OBJ.FUNC.SCORE for all models
my $scores = ();
if ($modelcnt > 0) {
  die "${subrname}__E> Could not parse energies from log file\n"
     unless ( $scores = CurrentEnergies( $lognam ));
}

die "${subrname}__E> Number of models do not match the number of energies\n"
  unless ( keys %$scores == $modelcnt );

# --- Create output alignment directory
MakeDirSys($moddir) ||
   die "${subrname}__E> Could not create output directory: $moddir\n";

# --- Open the output file
my $fhout = OpenNewFile( $outfile );

if ($modelcnt > 0) {
  # --- Select models as per choice
  my $bestmodel = SelectBestModel( $scores, $selmodby ) or
     die "${subrname}__E> Failed during selection of best model\n";

  # -- Rename and move models
  if ( $returntyp =~ /^BEST$/ ){
     ProcessBestModel( $bestmodel, $scores, $hetcnt, $watcnt, $moddir, $fhout, $modbase ) or
        die "${subrname}__E> Failed processing best model: $bestmodel \n";
  } else {
     ProcessAllModels( \@modelnames, $scores, $hetcnt, $watcnt, $moddir, $fhout, $modbase ) or
        die "${subrname}__E> Failed processing models\n";
  }
}

# -- Update garabage list
push @cleanup_files, @modelnames, "*.rsr", "*.ini", "*.sch", "*.V9999*",
     "*.D0000*";

# --- Close output file stream
close( $fhout );

# --- Clean up files
if ( $cleanup =~ /ON/i ){
   foreach my $file ( @cleanup_files ){
      system("rm -rf $file")  if ( -e $file );
   }
}

# --- Exit finally
exit 0;
}

sub PrepAliForLigands {

   use List::Util qw( max );

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 10;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($ali, $alnfile, $tgtcod, $pdbrep, $newrep, $hetatm, $spherecut, 
       $waters, $watercut, $garbage)  = @_;

   my $hetcnt = my $watcnt = my $refidx = undef;
   foreach my $idx ( 0 .. $#{$ali->code} ){

      # -- Skip if entry is not a structure
      next if ($ali->type->[$idx] !~ /\bstructure/ );

      # -- Prepare some filenames
      my $newpdb = $ali->code->[$idx] . ".pdb";
      my $alibeg = join(':', $ali->inipos->[$idx], $ali->chain1->[$idx]);
      my $aliend = join(':', $ali->endpos->[$idx], $ali->chain2->[$idx]);

      # -- Prepare python file for selection
      my ($pynam, $lognam, $fhpy) = OpenModIOFiles(undef);
      WRTPY_SELFORLIGANDS($fhpy, $pdbrep, $ali->file->[$idx], $newrep, 
                          $newpdb, $alibeg, $aliend,
                          $spherecut, $watercut, $hetatm, $waters);

      # --- Close the python file
      close ( $fhpy );

      # --- Update garbage list
      push @{$garbage}, $pynam, $lognam, $newrep;

      # --- Run MODELLER to get selection
      RunModeller($pynam)
        or die "${subname}__E> MODELLER failed when creating selection: $alnfile\n" .
               "${subname}__E>      PDB Align Code:",  $ali->code->[$idx], "\n";

      # -- Check output file to see if hetatms/waters are present
      ($hetcnt,$watcnt) = (GetPDBData($newpdb, $newrep))[1,2];

      warn "${subname}__M> Found $hetcnt HET & $watcnt HOH residues in selection\n" .
           "${subname}__M>    Alignment file: $alnfile \n" .
           "${subname}__M>    Entry in alignment: ", $ali->code->[$idx], "\n" .
           "${subname}__M>    Will now patch the alignment ... \n";

      # -- if present, record the number of hetatms/waters & value of idx and exit loop
      #    Note that this will terminate when the first template with *either* of the
      #    conditions match. The rigorous solution when using multiple templates can
      #    get very hairy.
      if ( $hetcnt > 0 || $watcnt > 0){
         $ali->file->[$idx] = $newpdb;
         $refidx = $idx;
         last;
      }
   } 

   # -- Do the following stuff only if a het residue was found
   if ( $hetcnt > 0 || $watcnt > 0){

      # -- Include new directory into pdbrep
      $pdbrep = join(':', $newrep, $pdbrep);

      # -- This is a modification that is necessary to handle modified residues
      #    in the PDB files (like carboxylated lysine etc).
      
      # -- Prepare the residue range for the chosen template
      my $refpdb = basename($ali->file->[$refidx]);
      my $alibeg = join(':', $ali->inipos->[$refidx], $ali->chain1->[$refidx]);
      my $aliend = join(':', $ali->endpos->[$refidx], $ali->chain2->[$refidx]);

      # -- First go and fetch the sequence found in the PDB file as read by
      #    MODELLER
      my $pdb = GetSeqFromPDB($pdbrep, $refpdb, 'FIRST:@', 'END',
                              $ali->code->[$refidx], $refpdb, $hetatm, $waters);

      # -- This is a fix for PDB files that have HETs with a chain id. Modeller
      #    will include them in the sequence before the first chain break, but
      #    the protocol here waits for the first chain break to differentiate
      #    between intra-chain and extra-chain HETs. So the workaround for this
      #    is to scan the PDB sequence backwards and introduce a chain break
      #    just before the first standard aa type. There are a million reasons
      #    why this is dirty but here it is due to lack of a better solution.
      foreach my $p ( 1 .. length($pdb->sequence->[0]) ){
         my $pres = substr($pdb->sequence->[0], -($p), 1);
         my $nres = substr($pdb->sequence->[0], -($p+1), 1);
         
         if ( ! IsSTDAA($pres) && IsSTDAA($nres) ){
            substr($pdb->sequence->[0], -($p), 0) = '/';
            last;
         }
      }

      # -- Now compare the alignments in 'ali' with that in 'pdb' for the 
      #    reference sequence and patch appropriately.
      my $chk = 1; 
      while ( $chk ){
         my $match = 0;
         my $longer = max(length($ali->sequence->[$refidx]), length($pdb->sequence->[0]));
         my $chnbrk = 0;
         foreach my $p ( 0 .. $longer-1 ){

            # -- Get the alignment & PDB residue
            my $ares = substr($ali->sequence->[$refidx], $p, 1);
            my $pres = substr($pdb->sequence->[0], $p, 1);

            # -- Count the chain breaks so that you can differentiate
            #    b/w intra- and extra-chain HETs
            $chnbrk++ if ( $pres =~ /\// );

            # -- If it matches with the PDB sequence
            if ( $ares eq $pres ){
               $match++;
               next;
            }
              # -- If the alignment has a gap at this position,
              #    insert gap in in PDB sequence
              elsif ( IsGAP($ares) ){
                 substr($pdb->sequence->[0], $p, 0) = '-';
                 last;
              }
              # -- If the reference PDB sequence from MODELLER has a non-std
              #    aa type, that should be updated in the alignment
              #    with a gap in that position for all sequences except the
              #    template chain (Also note that this should be done only
              #    when the non-std residue is witihin the polypeptide chain
              #    and not when the nonstd residue is bound externally
              elsif ( ! IsSTDAA($pres) ){
                 # -- This is to take care of the problem of #s. The MODELLER restyp.lib
                 #    has more than one residue assigned to #. The general solution is
                 #    to assign a '.' to all hetatms, since they are treated rigid bodies
                 #    anyway. The PDB sequence has to be updated to prevent going into
                 #    into an infinite loop.
                 if ( $pres !~ /\// ){
                    $pres = '.';
                    substr($pdb->sequence->[0], $p, 1) = '.';
                 }

                 # -- Now deal with all other sequences.
                 foreach my $i ( 0 .. $#{ $ali->code } ){
                    # -- The target sequence
                    if ( $ali->code->[$i] eq $tgtcod ){
                       substr($ali->sequence->[$i], $p, 0) = ($chnbrk == 0 ) ? '-' : $pres;
                    }

                      # -- The template with ligand
                      elsif ( $ali->code->[$i] eq $ali->code->[$refidx] ){
                         substr($ali->sequence->[$i], $p, 0) = $pres; 
                      }

                      # -- For everything else
                      else {
                        substr($ali->sequence->[$i], $p, 0) = '-';
                      }
                 }
                 last;
              }
              else {
                 die "${subname}__E> This routine is unable to handle the differences in\n" .
                     "${subname}__E>   between the sequences found in the PDB file and alignment\n";
              }
         }

         $chk = 0 if ( $match == length($ali->sequence->[$refidx] ));
      }

      # -- Update the residue range field
      $ali->endpos->[$refidx] = $pdb->endpos->[0];

   } # -- End if hetatm was found

   # -- Return
   return ($pdbrep, $hetcnt, $watcnt, $ali);
}

sub ProcessBestModel {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 7;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($model, $scores, $hetcnt, $watcnt, $moddir, $fhout, $modbase)  = @_;

   # --- Get the extension from the model name
   my $modext = (fileparse($model, '\..*'))[2];

   # --- Create a new filename
   my $newname = ${modbase} . ${modext};

   # --- Move the alignement in to the new (final) directory
   my $modfile = "${moddir}/${newname}";
   unless ( CopyFile($model, $modfile) ){
      warn "${subname}__E> Could not copy model to final destination\n";
      warn "                Source: $model\n";
      warn "                Target: $modfile\n";
   }

   my $scrstr = 'MODELS: ';
   my $i = 1;
   $scrstr .= sprintf "%4d|%4d|%4d|", $i, $hetcnt, $watcnt;
   for(my $j = 0; $j <= scalar( @{ $scores->{$model} }); $j++){
     if ( $j <= 2 ){  # -- MOLPDF, DOPE*
        $scrstr .= sprintf "%12.4f|", $scores->{$model}->[$j];
     } elsif ($j == 3){ # -- GA341
        $scrstr .= sprintf "%5.2f|", $scores->{$model}->[$j];
       }
       elsif ($j == 4){ # -- COMPACTNESS
        $scrstr .= sprintf "%8.4f|", $scores->{$model}->[$j];
       }
       elsif ($j >= 5 && $j <= 7){ # -- ENERGY FROM PANCHO
        $scrstr .= sprintf "%12.4f|", $scores->{$model}->[$j];
       }
       elsif ($j >= 8 && $j <= 10){ # -- Z-SCORES FROM PANCHO
        $scrstr .= sprintf "%10.4f|", $scores->{$model}->[$j];
       }
       elsif ($j == 11){ # -- Z-SCORE FROM DOPE (NORMALIZED)
        $scrstr .= sprintf "%8.2f", $scores->{$model}->[$j];
       }
   }
   $scrstr .= sprintf " #%s\n", $modfile;
   printf $fhout $scrstr;

   # -- Return
   return 1;
}



sub ProcessAllModels {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 7;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($modelnames, $scores, $hetcnt, $watcnt, $moddir, $fhout, $modbase)  = @_;

   # --- Rename the models
   for(my $i = 0; $i < scalar(@$modelnames); $i++){
   
      # --- Get the extension from the model name
      my $model = $modelnames->[$i];
      my $modext = (fileparse($model, '\..*'))[2];

      # --- Create a new filename
      my $newname = ${modbase} . ${modext};

      # --- Move the alignement in to the new (final) directory
      my $modfile = "${moddir}/${newname}";
      unless ( CopyFile($model, $modfile) ){
         warn "${subname}__E> Could not copy model to final destination\n";
         warn "                Source: $model\n";
         warn "                Target: $modfile\n";
      }

      my $scrstr = 'MODELS: ';
      $scrstr .= sprintf "%4d|%4d|%4d|", $i+1, $hetcnt, $watcnt;
      for(my $j = 0; $j <= scalar( @{ $scores->{$model} }); $j++){
        if ( $j <= 2 ){  # -- MOLPDF, DOPE*
           $scrstr .= sprintf "%12.4f|", $scores->{$model}->[$j];
        } elsif ($j == 3){ # -- GA341
           $scrstr .= sprintf "%5.2f|", $scores->{$model}->[$j];
          }
          elsif ($j == 4){ # -- COMPACTNESS
           $scrstr .= sprintf "%8.4f|", $scores->{$model}->[$j];
          }
          elsif ($j >= 5 && $j <= 7){ # -- ENERGY FROM PANCHO
           $scrstr .= sprintf "%12.4f|", $scores->{$model}->[$j];
          }
          elsif ($j >= 8 && $j <= 10){ # -- Z-SCORES FROM PANCHO
           $scrstr .= sprintf "%10.4f|", $scores->{$model}->[$j];
          }
          elsif ($j == 11){ # -- Z-SCORE FROM DOPE (NORMALIZED)
           $scrstr .= sprintf "%8.2f", $scores->{$model}->[$j];
          }
      }
      $scrstr .= sprintf " #%s\n", $modfile;
      printf $fhout $scrstr;

   }

   # -- Return
   return 1;
}

# --- Usage
sub usage {
print <<EOF;
${0}:

      --alignment_file		Name of the alignment file.

      --model_index_start	Starting index number of the first model. Applicable
				only when building multiple alignments per alignment.
				Default: 1

      --number_of_models	Number of models to be generated per alignment.
				Default: 1

      --runname			Run name. This will be used to name the
				MODELLER TOP file.
				Default: <target_code>-models

      --target_code		Code of the target sequence in the alignment files.
				It will not proceed if this is missing or wrong.

      --output_moddir		Directory to store the models.
				Default: <target_code>-models-ali

      --outfile_name		File to store the details of the models.
				Default: <target_code>-models.out

      --model_basename		Basename for the model files. The
				actual model files will be based on this
				variable appended with a serial number.
                                Default: <target_code>-models

      --pdb_repository		The PDB repository to look for coordinate files.
                                Should exist if --set_assume_pdb = ON.
                                Default: $default_pdb_repository

      --include_hetatm		Include HETATM records in model. Use if specifying
                                BLK residues or such. Default: OFF

      --include_waters          Include WATER records in model.
                                Default: OFF

      --clean_up		Clean up all the files that this program produced
                                except the final alignments. Default: ON

      --select_model_by		Score to use to select the best model, if multiple
                                models were calculated for the given alignment. 
                                Allowed values are: MOLPDF, DOPE, DOPEHR, 
                                GA341, COMPACT, ENEPAIR, ENESURF, ENECOMB, ZPAIR,
                                ZSURF, ZCOMB, NORMDOPE.
                                Default: MOLPDF

      --return                  Whether to return the BEST model or ALL of them.
                                Default: BEST

      --version                 Report version number of this program.

      --help			This help. Pipe it through 'more' if it
				scrolls off the screen.
EOF
}
