# This file is part of ModPipe, Copyright 1997-2010 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

package PLLib::Modeller;
require Exporter;
@ISA    = qw(Exporter);
@EXPORT = qw( RunModeller 
              WRTTOP_RWALI ParseIDMAT
              SeqIdFromAli CountModels CurrentEnergies GA341Scores 
              ParseSuperposeLog
              GetNativeOvlp SelectBestModel OpenModIOFiles 
              GetSeqFromPDB GetPDBXREF
              );

use strict;
use PLLib::Utils;
use File::Basename;
use PLLib::ModPy;
use PLLib::Alignment;
use MPLib::Binaries;

# -- Reads a PDB file and returns an alignment
#    object containing the sequence found in that file
#    as seen by MODELLER.

sub GetSeqFromPDB {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 8;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($pdbrep, $pdbfile, $range_beg, $range_end, 
       $acode, $afile, $hetatm, $water) = @_;

   # -- Open I/O files for MODELLER
   my ($pynam, $lognam, $pyfh) = OpenModIOFiles(undef);

   # -- Open temporary file for output
   my $outfile = TMPName() . ".pir";

   # -- Write Python input script
   WRTPY_GETSEQFROMPDB($pyfh, $pdbrep, $pdbfile, $range_beg, $range_end,
                       $acode, $afile, $outfile, $hetatm, $water);

   # --- Close the python file
   close ( $pyfh );

   # -- Execute MODELLER
   RunModeller($pynam)
     or die "${subname}__E> MODELLER failed to write out sequence for $pdbfile\n"; 

   # -- Check for the presence of the output alignment file
   unless ( -e $outfile ){
      warn "${subname}__E> No output sequence file found: $outfile\n";
      return;
   }

   # -- Read in the alignment into an ali obj
   my $ali = undef;
   $ali = ReadAlignment($ali, $outfile, 'PIR', 'HORIZONTAL');

   # -- Return the alignment object 
   return $ali;
}


sub OpenModIOFiles {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my $runname = $_[0];

   # --- Check if a runname has been specified
   my ($pynam, $lognam);
   if ( $runname ne '' ){
      $pynam  = "${runname}.py";
      $lognam = "${runname}.log";
   } else {
        my $tmpname = TMPName();
        $pynam  = "${tmpname}.py";
        $lognam = "${tmpname}.log";
     }

   # --- Open the TOP file
   my $fhpy = OpenNewFile( $pynam );

   # -- Return files and handles
   return($pynam, $lognam, $fhpy);

}

sub SelectBestModel {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($modscr, $scrtyp) = @_;

   # -- Create a dummy array with all score names
   my @availscores = ['MOLPDF', 'DOPE', 'DOPEHR', 'GA341',
                      'COMPACT', 'ENEPAIR', 'ENESURF', 'ENECOMB',
                      'ZPAIR', 'ZSURF', 'ZCOMB', 'NORMDOPE'];

   # -- Get the index of the specified score from the array
   my $scridx = undef;
   for(my $i = 0; $i < @availscores; $i++){
     if ( $availscores[$i] =~ /^$scrtyp$/ ){
        $scridx = $i;
        last;
     }
   }

   # -- Fetch the sorted keys by specified score
   my @sorted_keys = ();
   @sorted_keys = sort { $modscr->{$a}->[$scridx] <=> 
                         $modscr->{$b}->[$scridx] } keys %$modscr;

   # -- Pop the best model
   my $bestmodel = shift @sorted_keys;

   # -- Return
   return $bestmodel;
}

sub GetNativeOvlp {

   use PLLib::PDBUtils;
   use PLLib::Sequence;
   use PLLib::Modeller;
   use PLLib::ModPy;

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 7;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($acodem, $modelf, $chnmod, $acoden, $nativef, $chnnat,
       $outfile) = @_;


   # -- Extract sequence from model
   my ( $aaseqm, $aaidxm) = undef;
   unless ( ( $aaseqm, $aaidxm ) = GetChnSeqIdx( $modelf, $chnmod )){
     warn "${subname}__E> Failed getting reference sequence: \"$chnmod\", $modelf\n";
     return;
   }

   # -- Extract sequence from native
   my ( $aaseqn, $aaidxn ) = undef;
   unless (( $aaseqn, $aaidxn ) = GetChnSeqIdx( $nativef, $chnnat )){
     warn "${subname}__E> Failed getting reference sequence: \"$chnnat\", $nativef\n";
     return;
   }

   # -- Get the beg/end of sequences
   my ($begm, $endm) = undef;
   unless (($begm, $endm) = FindSeqInSeq($aaseqm, $aaseqm)){
     warn "${subname}__E> Model sequence not found in PDB\n";
     return;
   }

   # -- Get the beg/end of sequences
   my ($begn, $endn) = undef;
   unless (($begn, $endn) = FindSeqInSeq($aaseqm, $aaseqn)){ 
     warn "${subname}__E> Model sequence not found in native\n";
     return;
   }

   # -- Superpose and get data
   # -- Write the MODELLER python script and execute it
   my $pybase = TMPName();
   my $pyname = "${pybase}.py";
   my $pylogf = "${pybase}.log";

   # -- Write the python file
   my $fhpy = OpenNewFile( $pyname );
   WRTPY_SUPERPOSEAV_NOALI($fhpy, $acodem, $modelf, 
                           $aaidxm->[$begm-1], $aaidxm->[$endm-1], 
                           $chnmod, $acoden, $nativef, $aaidxn->[$begn-1], 
                           $aaidxn->[$endn-1], $chnnat, $outfile);
   close( $fhpy );

   # -- Run MODELLER for the conversion
   unless ( RunModeller($pyname) ){
     warn "${subname}__E> MODELLER failed on script file: $pyname\n";
     return;
   }

   # -- Parse the relevant numbers out
   my $iter = 0;
   my ($alnmdl1, $selmdl1, $alnmdl2, 
       $geqvp, $grmsd, $dcut, $eqvp, $rmsd) = [];
   unless(($iter, $alnmdl1, $selmdl1, $alnmdl2, $geqvp, $grmsd,
       $dcut, $eqvp, $rmsd) = ParseSuperposeLog( $pylogf )){
     warn "${subname}__E> Could not parse logfile: $pylogf\n";
     return;
   }

   # -- Check if transformed coordinates exist
   if ( $outfile ne '' && ! -e $outfile ){
     warn "${subname}__E> Could not find file with transformed coordinates\n";
     return;
   }

   # -- Remove python/log files
#   unlink( $pyname, $pylogf);

   # -- Return values
   return ( $aaidxm->[$begm-1], $aaidxm->[$endm-1], 
            $aaidxn->[$begn-1], $aaidxn->[$endn-1],
            $iter, $alnmdl1, $selmdl1, $alnmdl2, $geqvp, $grmsd, 
            $dcut, $eqvp, $rmsd, $outfile);

}

sub ParseSuperposeLog {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ( $logfile ) = $_[0];

   # --- Check for log file
   unless ( -e $logfile ){
      warn "${subname}__E> Could not find file: $logfile\n";
      return;
   }

   # --- Open the log file
   my $fhlog = OpenFile( $logfile );

   # --- Parse the file for current energies
   my (@alnmdl1, @selmdl1, @alnmdl2, @geqvp, @grmsd) = (); 
   my (@dcut, @eqvp, @rmsd) = ();
   my $iter = 0;
   while ( my $line = <$fhlog> ){
      if ( $line =~ /^Least-squares superposition:/ ){
         $iter++;
         while ( $line = <$fhlog> ){
            chomp $line;

            # -- Store the number of residues in the alignment
            #    the global aligned positions & rmsd
            # -- Number of residues in MDL1
            push @alnmdl1, (split(" ", $line))[6]
              if ($line =~ /Numb of residues in MODEL /);

            # -- Number of selected atoms in MDL1
            push @selmdl1, (split(" ", $line))[8]
              if ($line =~ /Numb of atoms\/sel atoms in MODEL/);

            # -- Number of residues in MDL2
            push @alnmdl2, (split(" ", $line))[6]
              if ($line =~ /Numb of residues in MODEL2 /);

            # -- Number of aligned eqv pos
            push @geqvp, (split(" ", $line))[6]
              if ($line =~ /Numb of aligned equiv positions /);

            # -- Global rmsd
            push @grmsd, (split(" ", $line))[4]
               if ($line =~ /  RMS after superposition /);

            # -- Now gather relevant iformation for this cut-off
            push @dcut, (split(" ", $line))[3]
              if ( $line =~ /Distance cutoff / );

            push @eqvp, (split(" ", $line))[6]
              if ( $line =~ /Numb of equiv cutoff positions / );

            push @rmsd, (split(" ", $line))[5]
              if ( $line =~ /Cutoff RMS after superposition / );

            # -- Skip to next iteration
            last if ( $line =~ /^SUPERPOSITION OF/ );
         }
      }
   }

   # --- Close the file
   close ( $fhlog );

   # -- Return error
   return if ( $iter == 0 );
   return if ( $iter != scalar(@dcut) ||
               $iter != scalar(@eqvp) ||
               $iter != scalar(@rmsd) );

   # --- Return
   return ($iter, \@alnmdl1, \@selmdl1, \@alnmdl2, \@geqvp, \@grmsd, 
           \@dcut, \@eqvp, \@rmsd);
}


sub GA341Scores {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ( $logfile ) = $_[0];

   # --- Check for log file
   unless ( -e $logfile ){
      warn "${subname}__E> Could not find file: $logfile\n";
      return;
   }

   # --- Open the log file
   my $fhlog = OpenFile( $logfile );

   # --- Parse the file for current energies
   my @ga341 = ();
   while ( my $line = <$fhlog> ){
      if ( $line =~ /^GA341 score/ ){
         push @ga341, (split(" ", $line))[3];
      }
   }

   # --- Close the file
   close ( $fhlog );

   # --- Return
   return @ga341;
}

sub CurrentEnergies {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my $logfile  = $_[0];

   # --- Check for log file
   unless ( -e $logfile ){
      warn "${subname}__E> Could not find file: $logfile\n";
      return;
   }

   # --- Open the log file
   my $fhlog = OpenFile( $logfile );

   # --- Parse the file for current energies
   my %scores = ();
   while ( my $line = <$fhlog> ){
     next unless ( $line =~ /^SCR>/ );
     chomp $line;
     my (undef, $filename, @fields) = split ' ', $line;
     $scores{$filename} = [ @fields ];
     last if ( $line =~ /^>> END OF SUMMARY/ );
   }

   # --- Close the file
   close ( $fhlog );

   # -- Verify that the hash has at least one value
   unless ( keys %scores ){
     warn "${subname}__E> No energies parsed!\n";
     return;
   }

   # --- Return
   return (\%scores);
}

sub CountModels {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 3;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ( $tgtcode, $begmod, $endmod ) = @_;

   my @modnames = ();
   foreach my $mod ($begmod .. $endmod){
   
      # --- Create the model filename
      my $modindx = sprintf "%04d", $mod;
      my $modname = "$tgtcode" . ".B9999" . $modindx . ".pdb";

      # --- Look for models
      push @modnames, $modname if ( -e $modname );
   }

   # --- Return array of names
   return @modnames;
}


sub SeqIdFromAli {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 3;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ( $alifile, $aliformat, $cleanup ) = @_;

   # --- See if alignment file exists
   unless ( -e $alifile ){
      warn "${subname}__E> Could not find alignment file: $alifile\n";
      return;
   }

   # --- Create temporary filenames
   my $temptag = TMPName();
   my $pyname  = "${temptag}.py";
   my $logname = "${temptag}.log";
   my $matname = "${temptag}.mat";

   # --- Open the TOP file (IDMAT)
   my $fhpy = OpenNewFile( $pyname );

   WRTPY_IDTABLE($fhpy, $alifile, $aliformat, $matname);

   # --- Close the TOP file (IDMAT)
   close ( $fhpy );

   # --- Run MODELLER for IDTABLE
   unless ( RunModeller($pyname) ){
      warn "${subname}__E> MODELLER failed on id_table: $pyname\n";
      warn "${subname}__E> Failed processing alignment for modeling:\n";
      warn "               Alignment file: $alifile\n";
      return;
   }

   # --- Make sure MODELLER wrote the final alignment
   unless ( -e $matname ){
      warn "Could not find output of MODELLER: $matname\n";
      return;
   }

   # --- Parse the ID_TABLE mat file to get all sequence identities
   my @seqids = ParseIDMAT($matname, 1);

   # --- Clean up files
   unlink( $pyname, $logname, $matname ) if ( $cleanup =~ /ON/i );

   # --- Return values
   return @seqids;
}


sub ParseIDMAT {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($idmatfile, $targetcol) = @_;

   # --- Check for matrix file
   unless ( -e $idmatfile ){
      warn "${subname}__E> Could not find matrix file: $idmatfile\n";
      return;
   }

   my @seqids = ();
   my $fhmat = OpenFile( $idmatfile );
   while ( my $matln = <$fhmat> ){
      next if (substr($matln, 6, 1) ne '@');
      chomp $matln;
      my $dist = substr($matln, 10);
      my $seqid = (split(" ", $dist))[$targetcol - 1];
      push @seqids, 100-$seqid;
   }
   close($fhmat);

   # --- Return
   return @seqids;
}


sub WRTTOP_RWALI {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 6;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($topfh, $pdbdir, $inpali, $inpformat,
       $outali, $outformat) = @_;

   # --- Create Python file with commands
   print $topfh "from modeller import *\n";
   print $topfh "env = environ()\n";
   print $topfh "env.io.atom_files_directory = '$pdbdir'\n";
   print $topfh "aln = alignment(env, file='$inpali',\n";
   print $topfh "                alignment_format='$inpformat',\n";
   print $topfh "                allow_alternates=True)\n";
   print $topfh "aln.write(file='$outali',\n";
   print $topfh "          alignment_format='$outformat')\n";

   # --- Return value
   return 1;
}


sub RunModeller {

use File::Basename;

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($topfname) = @_;

   my $mod = GetModeller();

   # --- Create name for log file
   my ($name,$dir,$ext) = fileparse($topfname,'\..*');
   my $logfname = "${name}.log";

   # --- Actually run MODELLER
   my $retval = system("$mod $topfname > $logfname");

   # --- Check return value
   if ($retval != 0) {
      print "${subname}__E> MODELLER failed: ";
      if ($? == -1) {
        print "$!\n";
      } elsif ($? & 127) {
        printf "died with signal %d\n", ($? & 127);
      } else {
        printf "exited with value %d\n", ($? >> 8);
      }
      return;
   }

   # --- Check for log file
   unless ( -s $logfname ){
      print "${subname}__E> MODELLER produced no (or empty) log file\n";
      return;
   }

   # --- Check for errors in the log file
   my ( $error_count, $warn_count, $version ) = undef;
   open(LOG, "< $logfname");
   while ( <LOG> ){
      if ( / MODELLER [0-9]/ ) { s/^\s+//; $version = $_ }
      if ( /_E>/ && ! /_E> Obj\. func\. \(/) { $error_count++ }
      if ( /_W>/ && ! /^io_data____W> Setting io\.atom_files_directory to/) {
        $warn_count++
      }
   }
   close(LOG);

   # --- Report error count
   if ( $error_count ){
      print "${subname}__E> MODELLER produced $error_count errors\n";
      return ;
   }

   # --- Report warnings
   if ( $warn_count ){
      print "${subname}__W> MODELLER produced $warn_count warnings\n";
   }

   return 1;
}
