#!/usr/bin/perl
# This file is part of ModPipe, Copyright 1997-2010 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

# Get matches from one iteration of profile.build() against full UniProt, and
# return full-length sequences.

# Component of template-based modelling / Leverage / Structure Impact

# Called by: TemplateBased.pl > WriteSGESI()

use Getopt::Long;
use File::Basename;
use File::Path;
use Cwd;
use strict;

# --- Load local modules
use PLLib::Utils;
use PLLib::Sequence;
use PLLib::Modeller;

use MPLib::Version;
use MPLib::Binaries;
use MPLib::MPInit;
use MPLib::MPUtils;
use MPLib::MPModules;
use MPLib::MPSelectHits;

# --- Get command line options.
#     Option names are matched case-sensitively.
$Getopt::Long::ignorecase = 0;

my ( $seqid, $conffile, $help, $cleanup, $profile_update );
my ( $max_sequences );

GetOptions (
   "conf_file=s"      => \$conffile,
   "sequence_id=s"    => \$seqid,
   "clean_up=s"       => \$cleanup,
   "max_sequences=s"  => \$max_sequences,
   "profile_update=s" => \$profile_update,
   "help"             => \$help,
   "version"          => sub { VersionMessage() },
);

# --- Check command line options
if ( $help ){
   usage();
   exit 0;
}

# --- Set default values for options that were not given (or given as
#     an empty/zero value).
$profile_update = "OFF" unless ( $profile_update );
$max_sequences  = 30000 unless ( $max_sequences );
$cleanup        = 'ON'  unless ( $cleanup );

# --- Get Program name
my $subrname = GetSubrName();

# --- Check for configuration file
die "${subrname}__E> Cannot proceed without configuration file\n"
   unless ( $conffile && -e $conffile );

# --- Check for sequence md5 hash
die "${subrname}__E> Cannot proceed without sequence id\n"
   unless ( $seqid );
# DKTEMP
print "[GetFullSeqs] seqid: $seqid\n";

# --- Read in the configuration file
die "${subrname}__E> Failed initializing configuration\n"
   unless ( ModPipeInit( $conffile ) );

# --- Check if sequence exists
my $seqnam = SeqFileMP( $seqid );
die "${subrname}__E> Sequence file not found: $seqnam\n"
   unless ( -e $seqnam );

# --- Create the temporary directory for local stuff
my $tmpdir = "${init::tmpdir}/${seqid}";

MakeDirSys($tmpdir) ||
   die "${subrname}__E> Could not create temporary directory: $tmpdir: $!";

# --- Store the current directory
my $currdir = cwd();

# --- Move into temporary directory.  All scratch files below are written
#     with relative names, so continuing after a failed chdir would litter
#     (and later read from) the wrong directory.
chdir( $tmpdir ) ||
   die "${subrname}__E> Could not change into temporary directory: $tmpdir: $!\n";

# -----------------------
# Begin main calculations
# -----------------------

# --- Use uniprot90-based profile/alignment to find hits (one iteration) in
# full UniProt.

   # --- "Update" of existing profile, so copy uniprot90 profile to
   #     uniprot.  ("uniprot90" is the default given in the conf file.)

   my $prfnam90 = PrfFileMP( $seqid );
   my $prfnam   = PrfFileMP( $seqid, "uniprot" );
   # DKTEMP
   print "[GetFullSeqs] cp $prfnam90 $prfnam\n";
   # List form of system(): the file names are passed to cp verbatim and
   # never interpreted by a shell.
   if ( system( 'cp', $prfnam90, $prfnam ) ) {
      die "${subrname}__E> Could not copy profile $prfnam90 to $prfnam\n"
   }

   # --- Check that full UniProt file given in conf file
   die "${subrname}__E> FULLSEQDB not given in conf file\n"
      unless ( defined( $init::fullseqdb ) );

   my $seqformat = 'PROFILE';
   my $niter = 1;
   my $fullseqdb = $init::fullseqdb;
   unless ( GetProfile({ 'seqid'     => $seqid,
                         'seqformat' => $seqformat,
                         'niter'     => $niter,
                         'seqdb'     => $fullseqdb,
                         'nrdbtag'   => 'uniprot',
                         'prfupdate' => $profile_update,
                       }) ) {
      warn "${subrname}__E> Failed to calculate BUILD_PROFILE profile\n";
      warn "${subrname}__E>    Sequence: $seqid\n";
      exit 1;
   }



# --- For each sequence in full-UniProt profile/alignment, retrieve full-length
#     sequence from UniProt.

   # --- Extract sequence codes from profile/alignment and write to a file for
   #     input to GetSequences.pl.

   my @seqcodes = GetProfileSequenceCodes( $seqid, $max_sequences );

   # --- Save sequence codes in modpipe data structure
   my $seqdir = SeqDirMP($seqid);
   unless ( CopyFile("profile_seqcodes", "${seqdir}/profile_seqcodes") ){
      warn "${subrname}__E> Failed copying sequence codes to $seqdir\n";
      warn "${subrname}__E>    Source File: profile_seqcodes\n";
      warn "${subrname}__E>    Target Dir : ${seqdir}\n";
      # This is file scope, not a subroutine: "return" here would abort
      # with "Can't return outside a subroutine".  Exit with a failure
      # status instead, like the GetProfile failure path above.
      exit 1;
   }

   # --- Retrieve full-length sequences.

   # --- Create the options hash for GetSequences.pl.
   #     Assume the FASTA flat-file database is same name, but without ".hdf5"
   #     extension.
   my $fullseqdb_fasta = $fullseqdb;
   $fullseqdb_fasta =~ s/\.hdf5$//;

   #     Local output - named by $seqid.
   my $local_fullsequences_fasta = "${seqid}-profile_sequences.fsa";

   warn "${subrname}__M> Extracting full-length sequences...\n";
   ExtractSequences($fullseqdb_fasta, $local_fullsequences_fasta, 'FASTA',
                    \@seqcodes);

   # --- Save full-length sequences in Modpipe data structure
   my $fullsequences_fasta = "${seqdir}/$local_fullsequences_fasta";
   unless ( CopyFile($local_fullsequences_fasta, $fullsequences_fasta) ){
      warn "${subrname}__E> Failed copying full sequences to Modpipe data structure\n";
      warn "${subrname}__E>    Source File: $local_fullsequences_fasta\n";
      warn "${subrname}__E>    Target Dir : ${seqdir}\n";
      # File scope again: exit rather than return (see above).
      exit 1;
   }


# --- Move back to original directory
chdir($currdir) ||
   warn "${subrname}__W> Could not change back to directory: $currdir: $!\n";

# --- Clean up temporary directory
CleanupTMP($tmpdir) if ( $cleanup =~ /\bON\b/i );

# --- Exit finally
exit 0;


# ------------------------------------------------------------------------------
# Extract sequence codes from profile/alignment and write to a file for input to
# GetSequences.pl.  Also return in array.
# Code borrowed/modified from PLLib/ModProfile.pm.
#
# Arguments (exactly two):
#    $seqid          Sequence id; the profile read is PrfFileMP($seqid, "uniprot").
#    $max_sequences  Upper limit on the number of sequence codes harvested.
#
# Side effects:
#    Writes the harvested codes, one per line, to the file "profile_seqcodes"
#    in the current working directory.
#
# Returns:
#    The list of harvested sequence codes (at most $max_sequences of them).
#    An empty list if the argument count is wrong or a file cannot be opened.
sub GetProfileSequenceCodes {

   # --- Get subroutine name
   my $subrname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subrname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ( $seqid, $max_sequences ) = @_;

   my $prfnam = PrfFileMP( $seqid, "uniprot" );

   # --- Open input profile and output code list.
   #     NOTE(review): assumes OpenFile/OpenNewFile return a false value on
   #     failure - confirm against PLLib::Utils.
   my $FH_PRF = OpenFile($prfnam);
   unless ( $FH_PRF ){
      warn "${subrname}__E> Could not open profile: $prfnam\n";
      return;
   }

   my $FH_SCODES = OpenNewFile("profile_seqcodes");
   unless ( $FH_SCODES ){
      warn "${subrname}__E> Could not open output file: profile_seqcodes\n";
      close($FH_PRF);
      return;
   }

   # --- Skip first six (info) lines, and target line.
   for my $skip ( 1 .. 7 ) {
      my $header = <$FH_PRF>;
      last unless ( defined $header );
   }

   # --- Harvest at most $max_sequences codes.  The limit is tested before
   #     each read, so exactly $max_sequences codes are taken (not one more)
   #     and no extra line is consumed.
   my @scodes = ();
   while ( scalar(@scodes) < $max_sequences ) {

      # An explicit defined() test keeps a final line consisting of "0"
      # (without newline) from being mistaken for end of file.
      my $prf = <$FH_PRF>;
      last unless ( defined $prf );

      chomp $prf;
      $prf =~ s/^\s+//;
      my @fields = split(/\s+/, $prf);

      # --- Extract the sequence code (second whitespace-separated field).
      #     Blank or truncated lines carry no code: skip them instead of
      #     recording an undefined entry.
      my $scode = $fields[1];
      next unless ( defined $scode );

      # --- Write to file
      print $FH_SCODES $scode . "\n";

      # --- Add to array
      push(@scodes, $scode);
   }
   close($FH_PRF);

   # Buffered write errors only surface when the handle is closed.
   close($FH_SCODES) ||
      warn "${subrname}__E> Could not close output file: profile_seqcodes: $!\n";

   return @scodes;
}

# ------------------------------------------------------------------------------
# --- Usage
# Print the command line help text to the currently selected output handle
# (STDOUT unless redirected).  Takes no arguments; the caller decides whether
# to exit afterwards.
sub usage {
print <<EOF;

${0}:

Takes a modpipe profile (created using build-profile) as input, creates a new profile 
and writes out a fasta file with all sequences in the profile. 

This is for example used to extend a uniprot-90 profile to a full uniprot profile, and then 
harvest all sequences from that profile. 

The usual modpipe filename conventions are used.  FULLSEQDB  needs to be present in the 
configuration file. Output files include a list of database ids for the resulting profile, 
and a fasta file with the full-length sequences of the sequences in the profile, both
files in the MP directory structure sequence directory. 

     --conf_file		ModPipe configuration file. Cannot proceed 
				without this option.

     --sequence_id		Sequence Id. This is the MD5 digest of the 
   				sequence that has been added to the ModPipe 
				filesystem. Cannot proceed without this 
				option.
     --profile_update		[OFF], ON
     --max_sequences		Maximum number of sequences harvested from profile
				Default: 30000

     --clean_up			Flag to clean up the temporary directory
				after all operations. Can be OFF or ON.
				Default: ON

     --version                  Print version information.

     --help                     This help. Pipe it through 'more' if it
                                scrolls off the screen.
EOF
}
