# This file is part of ModPipe, Copyright 1997-2010 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

package PLLib::NCBIUtils;
require Exporter;
@ISA    = qw(Exporter);
@EXPORT = qw( CheckBlastDB ExecNCBI PsiBlastToMultAln GetBlastHits);
             

use strict;
use PLLib::Utils;
use PLLib::Sequence;

sub GetBlastHits {

# This routine parses the output from a vanilla blast run

  #--- Get subroutine name
  my $subname = GetSubrName();

  #--- Check arguments
  my $nargs = 1;

  unless (scalar(@_) == $nargs){
     print "${subname}__E> Insufficient arguments\n" ;
     return;
  }

  #-- reassign the variables
  my ( $bl_o ) = $_[0];

  # -- Open the blast output file
  my $fh_blo = OpenFile( $bl_o );

  # -- Initialize variables
  my ($id, $slen, $score, $evalue, $identity,
      $querystart, $sbjctstart, $qseq, $sseq, $qstartnum,
      $qstarttmp, $qseqtmp, $qendnum, $sstartnum, $sstarttmp,
      $sendnum, $naln, $sseqtmp) = undef;

  my @lines = (); my $iter = 0;
  my $blhits = [];

  # -- Start Parsing Psi-Blast output file
  while (my $line = <$fh_blo>){

     while ( $line =~ /^>/ ){
     chomp $line; push (@lines, $line);

       # -- continue reading thro' same filehandle 
       # until next sequence or end of file occurs
       while ($line = <$fh_blo>){
         chomp $line;
         last if ( $line =~ /^>/ || 
                   $line =~ /Database/);
         push (@lines, $line);
       }

       # -- once you are here you have all alignments 
       #    of one sequence in array @lines; all manipulations 
       #    done here with ONLY @lines
       my $index = 0;
       while ($index <= $#lines){

         # -- Parse sequence identifier
         if ( $lines[$index] =~ /^>/ ){
           my @F = split (/[\|\s+]/, $lines[$index]);
           $F[0] =~ s/^>//;
           $id = ( $F[0] =~ /gi/ || $F[0] =~ /gb/ ||
                   $F[0] =~ /emb/ || $F[0] =~ /pir/ ||
                   $F[0] =~ /sp/ || $F[0] =~ /pdb/ ||
                   $F[0] =~ /gnl/ || $F[0] =~ /ref/ ||
                   $F[0] =~ /lcl/) ? $F[1] : $F[0];
         }

         # -- Parse for length of database sequence
         $slen = (split(" ",$lines[$index]))[2]
            if ($lines[$index] =~ /\s+Length = /);

         # -- Parse each segment of alignment for Scores
         if ($lines[$index] =~ /\s+Score = /){
            $evalue = (split(" ",$lines[$index]))[7];

            for (my $i = $index + 1; $i <= $#lines; $i++){
   
               # -- Parse for identities
               if ($lines[$i] =~ /\s+Identities/){
                 $identity = (split(" ", $lines[$i]))[3];
                 $identity =~ s/[(%),]//g;

                 $naln = (split(" ", $lines[$i]))[2];
                 $naln = (split(/\//, $naln))[1];
 
                 $querystart = 0;
                 $sbjctstart = 0;
                 undef $qseq;
                 undef $sseq;
               }

               # -- Parse query sequence from alignment
               if ($lines[$i] =~ /^Query:/){

                  # -- Store starting number only if its first occurence
                  if ($querystart == 0) {
                     $qstartnum = (split(" ", $lines[$i]))[1]; 
                     $querystart = 1;
                  }
 
                  ($qstarttmp,$qseqtmp, $qendnum) = 
                      (split(" ", $lines[$i]))[1,2,3];
                  $qseq = $qseq . $qseqtmp;
               }

               # -- Parse hit sequence from alignment
               if ($lines[$i] =~ /^Sbjct:/){

                 # -- Store starting number only if its the first occurence
                 if ($sbjctstart == 0) {
                    $sstartnum = (split(" ", $lines[$i]))[1]; 
                    $sbjctstart = 1;
                 }
 
                 ($sstarttmp, $sseqtmp, $sendnum) = 
                     (split(" ", $lines[$i]))[1,2,3];
                 $sseq = $sseq . $sseqtmp;
               }

               # -- this will parse only the first alignment per region.
               #    refer to the original ParseBlast script to modify.
               $index = $i if ($lines[$i] =~ /\s+Score =/ || $i == $#lines);
            }

            # -- Insert parsed values into a hash
            push @{ $blhits }, { "dbid"     => $id,
                                 "length"   => $slen,
                                 "evalue"   => $evalue,
                                 "naln"     => $naln,
                                 "identity" => $identity,
                                 "qrange"   => [ $qstartnum, $qendnum ],
                                 "srange"   => [ $sstartnum, $sendnum ],
                                };

         }

         $index++;
       }

       # -- end-of-sequence start with next
       undef @lines;
     }
  }

  # -- Close psi-blast output file
  close($fh_blo);

  # -- Return
  return $blhits;
}


sub PsiBlastToMultAln {

  #--- Get subroutine name
  my $subname = GetSubrName();

  #--- Check arguments
  my $nargs = 5;

  unless (scalar(@_) == $nargs){
     print "${subname}__E> Insufficient arguments\n" ;
     return;
  }

  #-- reassign the variables
  my ( $seqfile, $pb_o, $seliter, $outfile, $outfmt ) = @_;

  # -- Open and read in the query sequence (FASTA)
  my $seq; my $fh_que = OpenFile( $seqfile );
  unless ( defined( $seq = ReadNextSeq($fh_que, 'FASTA')) ){
    warn "${subname}__E> Failed reading query sequence:\n";
    warn "${subname}__E>   File: $seqfile\n";
    return;
  }

  # -- Parse the original sequence
  my ( $o_id, $o_seq ) = (ParseSeq($seq, 'FASTA', 'OFF'))[0,11];
  my $olen = length($o_seq);
  close($fh_que);

  # -- Open the output file and write out the query sequence
  my $fh_out = OpenNewFile( $outfile );
  my $outseq = FormatSeq($o_id, 'sequence', ' ', ' ', ' ',
                         ' ', ' ', $o_id, 'Query Sequence', ' ',
                         ' ', $o_seq, 75, $outfmt);
  print $fh_out $outseq;

  # -- Open the psi-blast output file
  my $fh_pbo = OpenFile( $pb_o );

  # -- Check number of iterations in file
  my $iter = 0;
  while(<$fh_pbo>){
    $iter++ if ( /^Sequences producing/ );
  }

  # -- Check if specified iteration exists in file
  my $myiter;
  if ( $seliter =~ /^FIRST$/i ){
    $myiter = 1;
  } elsif ( $seliter =~ /^LAST$/i ){
      $myiter = $iter;
  } elsif ( $seliter > 0 && $seliter <= $iter ){
      $myiter = $seliter;
  } else { $myiter = 0 }

  # -- Rewind the blast-output file
  seek($fh_pbo, 0, 0);

  # -- Initialize variables
  my ($id, $slen, $score, $evalue, $identity, $positives,
      $querystart, $sbjctstart, $qseq, $sseq, $qstartnum,
      $qstarttmp, $qseqtmp, $qendnum, $sstartnum, $sstarttmp,
      $sendnum, $naln, $n, $padleft, $padrite, $i, $sseqtmp) = undef;
  my @inc_ids = my @lines = (); my $iter = 0;

  # -- Start Parsing Psi-Blast output file
  while (my $line = <$fh_pbo>){

     # -- identify which iteration of psi-blast
     $iter++ if ( $line =~ /^Sequences producing/ );

     while ( $line =~ /^>/ ){
       chomp $line; push (@lines, $line);

       # -- continue reading thro' same filehandle 
       # until next sequence or end of iteration or 
       # end of file occurs
       while ($line = <$fh_pbo>){
         chomp $line;
         last if ( $line =~ /^>/ || 
                   $line =~ /^Searching/ || 
                   $line =~ /Database/);
         push (@lines, $line);
       }

       # -- once you are here you have all alignments 
       #    of one sequence in array @lines; all manipulations 
       #    done here with ONLY @lines
       my $index = 0;
       while ($index <= $#lines){

         # -- Parse sequence identifier
         if ( $lines[$index] =~ /^>/ ){
           $id = (split (" ", $lines[$index]))[0];
           $id =~ s/^>//;
         }

         # -- Parse for length of database sequence
         $slen = (split(" ",$lines[$index]))[2]
            if ($lines[$index] =~ /\s+Length = /);

         # -- Parse each segment of alignment for Scores
         if ($lines[$index] =~ /\s+Score = /){
            $evalue = (split(" ",$lines[$index]))[7];

            for ($i = $index + 1; $i <= $#lines; $i++){
   
               # -- Parse for identities
               if ($lines[$i] =~ /\s+Identities/){
                 $identity = (split(" ", $lines[$i]))[3];
                 $identity =~ s/[(%),]//g;
 
                 $querystart = 0;
                 $sbjctstart = 0;
                 undef $qseq;
                 undef $sseq;
               }

               # -- Parse query sequence from alignment
               if ($lines[$i] =~ /^Query:/){

                  # -- Store starting number only if its first occurence
                  if ($querystart == 0) {
                     $qstartnum = (split(" ", $lines[$i]))[1]; 
           	    $querystart = 1;
                  }
 
                  ($qstarttmp,$qseqtmp, $qendnum) = 
                      (split(" ", $lines[$i]))[1,2,3];
                  $qseq = $qseq . $qseqtmp;
               }

               # -- Parse hit sequence from alignment
               if ($lines[$i] =~ /^Sbjct:/){

                 # -- Store starting number only if its the first occurence
                 if ($sbjctstart == 0) {
                    $sstartnum = (split(" ", $lines[$i]))[1]; 
                    $sbjctstart = 1;
                 }
 
                 ($sstarttmp, $sseqtmp, $sendnum) = 
                     (split(" ", $lines[$i]))[1,2,3];
                 $sseq = $sseq . $sseqtmp;
               }

               # -- this will parse only the first alignment per region.
               #    refer to the original ParseBlast script to modify.
               $index = $i if ($lines[$i] =~ /\s+Score =/ || $i == $#lines);
            }

            my $qlen = length ($qseq);

            # -- Count the number of aligned positions
            $naln = 0; 
            for ($n = 0; $n < $qlen; $n++){
               $naln++ if ( substr($qseq, $n, 1) ne '-' && 
                            substr($sseq, $n, 1) ne '-' );
            }

            # -- Pad sequences with -'s to match length of original sequence
            $padleft = $qstartnum - 1;
            $padrite = $olen - $qendnum;
            $qseq =~ s/^/chr(45) x $padleft/e;
            $sseq =~ s/^/chr(45) x $padleft/e;
            $qseq =~ s/$/chr(45) x $padrite/e;
            $sseq =~ s/$/chr(45) x $padrite/e;

            # -- Delete regions where insertions occur in query 
            #    sequence of alignement
            for ($n = $qstartnum - 1; $n < $qendnum; $n++){
              if ( substr($qseq, $n, 1) eq "-" ){
                 substr ($qseq, $n, 1) = "";
                 substr ($sseq, $n, 1) = "";
                 $n--;
              }
            }

            # -- If sequence in requested iteration, print
            my $idchk=0;
            if (grep {$_ eq $id} @inc_ids) {
                $idchk=1;
            }

            if ( ($myiter == 0 || $myiter == $iter ) &&
                 $idchk == 0 ){
               push @inc_ids, $id;
               my $outseq = FormatSeq($id, 'sequence', ' ', 
                                      $sstartnum, ' ', $sendnum, ' ', 
                                      $id, "Psi-Blast Iteration $iter", 
                                      $evalue, $identity, $sseq, 75, $outfmt);
               print $fh_out $outseq;
            }

         }

         $index++;
       }

       # -- end-of-sequence start with next
       undef @lines;
     }
  }

  # -- Close psi-blast output file
  close($fh_pbo);
  close($fh_out);

  # -- Return
  return 1;
}

sub ExecNCBI {

  #--- Get subroutine name
  my $subname = GetSubrName();

  #--- Check arguments
  my $nargs = 2;

  unless (scalar(@_) == $nargs){
     print "${subname}__E> Insufficient arguments\n" ;
     return;
  }

  #-- reassign the variables
  my ( $ncbiexe, $options ) = @_;

  # -- Check for the NCBI executable
  unless ( -x $ncbiexe ){
    warn "${subname}__E> Could not find NCBI executable: $ncbiexe\n";
    return;
  }

  # -- Check if output file is specified
  unless ( exists($options->{"-o"}) ){
    warn "${subname}__E> Output file option (-o) not specified\n";
    return;
  }

  # -- Run the executable and report a success/failure
  my $retval = system("$ncbiexe @{ [ %$options ] }");

  # -- Check return value
  if ( $retval ){
    warn "${subname}__E> Failed executing NCBI program\n";
    warn "${subname}__E>   PROGRAM: $ncbiexe\n";
    warn "${subname}__E>   OPTIONS: ${[%$options]}\n";
    return;
  }

  # -- Check if output file exists
  my $outfile = $options->{"-o"};
  unless ( -s $outfile ){
    warn "${subname}__E> NCBI executable produced no (or empty) output file\n";
    return;
  }
  
  # -- Return
  return 1;
}

sub CheckBlastDB {

  use File::Basename;

  #--- Get subroutine name
  my $subname = GetSubrName();

  #--- Check arguments
  my $nargs = 1;

  unless (scalar(@_) == $nargs){
     print "${subname}__E> Insufficient arguments\n" ;
     return;
  }

  #-- reassign the variables
  my $blsdb = $_[0];

  # -- Check for the existence of blast databases
  my $blsdbdir = dirname($blsdb);
  foreach my $blsext ( qw( phr psq pin ) ){
   
    # -- Get the directory and filename
    my $blsdbnam = basename($blsdb);
    my $blsdb    = "${blsdbdir}/${blsdbnam}.${blsext}";
 
    # -- Check if file exists
    unless ( -e $blsdb ){
      warn "${subname}__E> Could not file blast database file:\n";
      warn "${subname}__E>    Directory: ${blsdbdir}\n";
      warn "${subname}__E>    Filename : ${blsdbnam}.${blsext}\n";
      return;
    }
  }

  # -- Return
  return 1;
}

#######################

