#!/usr/bin/perl
# This file is part of ModPipe, Copyright 1997-2014 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

use strict;
use Getopt::Long;
use File::Basename;

use MPLib::Version;
use MPLib::Binaries;
use PLLib::Utils;
use PLLib::NCBIUtils;

# Command line state. Every variable below is filled in by GetOptions;
# each $pb_X variable carries the value handed to blastpgp as option "-X"
# (see the %pbopt hash further down).
my ($seqfile, $blsdb, $seliter, $pb_j,
    $pb_h, $pb_e, $pb_o, $pb_C, $pb_Q,
    $pb_F, $pb_a, $pb_v, $pb_b, $pb_s,
    $pb_t, $help, $outfmt, $outfile );

# -- Get command line options
# The result is kept so that unknown or malformed options abort the run
# (checked below) instead of being silently ignored.
my $options_ok = GetOptions (
           "query_sequence=s"   => \$seqfile,
           "blast_database=s"   => \$blsdb,
           "output_filename=s"  => \$outfile,
           "output_format=s"    => \$outfmt,
           "select_iteration=s" => \$seliter,
           "psiblast_j=s"       => \$pb_j,
           "psiblast_h=s"       => \$pb_h,
           "psiblast_e=s"       => \$pb_e,
           "psiblast_o=s"       => \$pb_o,
           "psiblast_C=s"       => \$pb_C,
           "psiblast_Q=s"       => \$pb_Q,
           "psiblast_F=s"       => \$pb_F,
           "psiblast_a=s"       => \$pb_a,
           "psiblast_v=s"       => \$pb_v,
           "psiblast_b=s"       => \$pb_b,
           "psiblast_s=s"       => \$pb_s,
           "psiblast_t=s"       => \$pb_t,
           "help"               => \$help,
           "version"            => sub { VersionMessage() },
           );

# -- Print usage information
if ( $help ) {
   usage();
   exit 0;
}

# --- Get Program name
my $subrname = GetSubrName();

# -- Refuse to continue after an option parsing failure. Getopt::Long has
#    already reported the offending option on STDERR at this point.
unless ( $options_ok ){
   warn "${subrname}__E> Invalid command line options\n\n";
   die "Try $subrname --help for usage information\n";
}

# -- Check input arguments
# Reported on STDERR (warn/die) like every other diagnostic of this script.
unless ( $seqfile && $blsdb ){
   warn "${subrname}__E> Input should be at least query sequence and blast database file\n\n";
   die "Try $subrname --help for usage information\n";
}

# Returns the option value when one was actually supplied (defined and
# non-empty), otherwise the fallback. Unlike a plain truth test this keeps
# an explicit 0 given on the command line.
my $value_or = sub {
   my ($value, $fallback) = @_;
   return ( defined $value && length $value ) ? $value : $fallback;
};

# -- Set the various default parameters
# Default output name: basename of the query file with its extension
# removed by the '\..*' suffix pattern, plus ".mul". fileparse is called in
# list context so the basename is picked explicitly.
unless ( $outfile ){
   my ($query_base) = fileparse($seqfile, '\..*');
   $outfile = $query_base . ".mul";
}

# Anything that does not start with "FASTA" (case-insensitive) means PIR.
$outfmt  = 'PIR' unless ( defined $outfmt && $outfmt =~ /^FASTA/i );
$seliter = 'ALL' unless ( $seliter );

# NOTE(review): TMPName() comes from a project module; presumably it
# returns a temporary file name - confirm.
$pb_o    = $pb_o ? $pb_o : TMPName();

# A count of 0 processors or iterations is treated as "not given".
$pb_a    = $pb_a ? $pb_a : 1;
$pb_j    = $pb_j ? $pb_j : 5;

# For thresholds and hit limits an explicit 0 is kept as given.
$pb_h    = $value_or->($pb_h, 0.0001);
$pb_e    = $value_or->($pb_e, 0.0001);

$pb_b    = $value_or->($pb_b, 20000);
$pb_v    = $value_or->($pb_v, 20000);

# Boolean switches are mapped to the T/F letters stored in %pbopt.
# -t and -s are on unless the value starts with FALSE; -F is off unless
# the value starts with TRUE.
$pb_t    = ( defined $pb_t && $pb_t =~ /^FALSE/i ) ? 'F' : 'T';
$pb_s    = ( defined $pb_s && $pb_s =~ /^FALSE/i ) ? 'F' : 'T';
$pb_F    = ( defined $pb_F && $pb_F =~ /^TRUE/i  ) ? 'T' : 'F';

# --- Check for existence of the input file
die "${subrname}__E> Could not find query sequence file: $seqfile\n"
   unless ( -e $seqfile );

# -- Check for the existence of blast databases
die "${subrname}__E> Could not find required blast databases\n"
   unless ( CheckBlastDB( $blsdb ) );

# -- Check for iteration count and .chk file request
if ( $pb_C && $pb_j < 2 ){
  warn "${subrname}__E> Cannot store PSI-Blast checkpoint file\n";
  warn "${subrname}__E>   when requested number of iterations\n";
  die  "${subrname}__E>   is less than 2\n";
}

# -- Check if PSI-Blast executable exists
my $pbexe = GetBlast("blastpgp");
die "${subrname}__E> Could not find PSI-Blast executable: $pbexe\n"
   unless ( -e $pbexe );

# -- Prepare the options hash for PSI-Blast
my %pbopt = (
            "-i" => $seqfile,
            "-d" => $blsdb,
            "-j" => $pb_j,
            "-h" => $pb_h,
            "-e" => $pb_e,
            "-o" => $pb_o,
            "-F" => $pb_F,
            "-a" => $pb_a,
            "-v" => $pb_v,
            "-b" => $pb_b,
            "-s" => $pb_s,
            "-t" => $pb_t,
            );

# -- Set other options as required
# The checkpoint (-C) and matrix (-Q) files are only requested when the
# user named them.
$pbopt{"-C"} = $pb_C if ( $pb_C );
$pbopt{"-Q"} = $pb_Q if ( $pb_Q );

# -- Run the Blast executable
die "${subrname}__E> Failed executing PSI-Blast\n"
   unless ( ExecNCBI($pbexe, \%pbopt) );

# -- Parse the blast output
# Reads the PSI-Blast output file ($pb_o) and writes the alignment to
# $outfile in the requested format.
PsiBlastToMultAln($seqfile, $pb_o, $seliter, $outfile, $outfmt)
  or die "${subrname}__E> Failed parsing PSI-Blast output: $pb_o\n";

exit 0;

# --- Usage
sub usage {
# Prints the command line help for this script to STDOUT. Takes no
# arguments; the heading is the script name as invoked ($0). The defaults
# listed here mirror the values assigned in the "default parameters"
# section of the script, so keep both in sync.
print <<EOF;
${0}:

      --query_sequence          Input sequence file. Should be in
                                FASTA format. Will not proceed without
                                this option.

      --blast_database          Name of the blast database to use for
                                scaning. Will not proceed without this
                                option.

      --output_filename         Name of the output file that will contain
                                the assembled multiple sequence alignment.
                                Default: <basename-of-query>.mul

      --output_format           Format of the output assembled multiple 
                                sequence alignment. Can be PIR or FASTA.
                                Default: PIR

      --select_iteration        Iteration to select when assembling the
                                multiple sequence alignment. Will assume 
                                'last' if selected iteration is not present
                                in the PSI-Blast output file.  Values can be 
                                'first', 'last', 'all' or a specific number.
                                Default: ALL

      --psiblast_j	        PSI-Blast option: the number of iterations
                                to scan database.
                                Default: 5

      --psiblast_h	        PSI-Blast option: E-value threshold for
                                inclusion in profile.
                                Default: 0.0001

      --psiblast_e	        PSI-Blast option: E-value threshold for
                                reporting significant hits.
                                Default: 0.0001

      --psiblast_o	        PSI-Blast option: File name to store the
                                output alignments.
                                Default: automatically generated temporary file

      --psiblast_C	        PSI-Blast option: File name to store the
                                binary output checkpoint file.
                                Default: NONE

      --psiblast_Q	        PSI-Blast option: File name to store the
                                ascii output matrix file.
                                Default: NONE

      --psiblast_F	        PSI-Blast option: Filter query sequence
                                for low complexity using SEG.
                                Default: False

      --psiblast_a	        PSI-Blast option: Number of processors
                                to use for calculation
                                Default: 1

      --psiblast_v	        PSI-Blast option: Number of database sequences
                                to show one-line descriptions for.
                                Default: 20000

      --psiblast_b	        PSI-Blast option: Number of database sequences
                                to show alignments for.
                                Default: 20000

      --psiblast_s	        PSI-Blast option: Compute locally optimal
                                Smith-Waterman alignments.
                                Default: True

      --psiblast_t	        PSI-Blast option: Use composition based
                                statistics.
                                Default: True

      --version                 Report version number of this program.

      --help                    This help.

EOF
}
