#!/usr/bin/perl -w

use strict;
use Getopt::Long;
use File::Basename;
use File::Spec;
use Cwd;

# Ensure that END handlers are run if we are killed by a signal
use sigtrap qw(die INT TERM);

# --- Load local modules
use PLLib::Utils;
use PLLib::Sequence;
use PLLib::Modeller;

use MPLib::Version;
use MPLib::Binaries;
use MPLib::MPInit;
use MPLib::MPUtils;
use MPLib::MPModules;

# --- Get command line options
# Option names are matched case-sensitively.
$Getopt::Long::ignorecase = 0;

# General run options
my ( $seqfile, $exitstage, $hitsmode, $evaluehits, $help,
     $cleanup, $tmpdir, $datdir, $tmpdirroot,
     $runname, $rundir, $selmodby, $finfile, $runwhere, $running_sge_jobid);

# SGE and output-file options
my ( $logdir, $jobname, $modpipe, $disks, $nodes,$tsvmod_flag,
     $priority, $sgefile, $modfile, $hitfile, $template_conf);

# --final_models_by may be given several times; every value is collected
my @final_modby = ();

# GetOptions returns false when it meets an unknown option or a value of
# the wrong type (it has already printed a diagnostic by then).  Stop in
# that case instead of silently running with defaults the user did not ask
# for.
GetOptions (
           "sequence_file=s"           =>      \$seqfile,
           "exit_stage=s"              =>      \$exitstage,
           "hits_mode=s"               =>      \$hitsmode,
           "evalue_hits=f"             =>      \$evaluehits,
           "score_by_tsvmod=s"         =>      \$tsvmod_flag,
           "clean_up=s"                =>      \$cleanup,
           "runname=s"                 =>      \$runname,
           "run_directory=s"           =>      \$rundir,
           "output_models_file=s"      =>      \$modfile,
           "output_hits_file=s"        =>      \$hitfile,
           "final_models_by=s"         =>      \@final_modby,
           "output_final_models=s"     =>      \$finfile,
           "run_where=s"               =>      \$runwhere,
           "help"                      =>      \$help,
           "version"                   => sub { VersionMessage() },

           "template_conf=s"           =>      \$template_conf,
           "tmpdir=s"                  =>      \$tmpdir,
           "tmpdirroot=s"              =>      \$tmpdirroot,
           "datdir=s"                  =>      \$datdir,

           "log_directory=s"      =>      \$logdir,
           "job_name=s"           =>      \$jobname,
           "modpipeclusterbase=s"            =>      \$modpipe,
           "disks=s"              =>      \$disks,
           "nodes=s"              =>      \$nodes,
           "priority=i"           =>      \$priority,
           "sge_file=s"           =>      \$sgefile,
           ) or die basename($0) . "__E> Invalid command line; " .
                    "try --help for usage help\n";

# --- Check command line options

# -- A request for help prints the usage text and ends the run
if ( $help ) {
    usage();
    exit 0;
}

# -- TSVMod scoring is ON unless stated otherwise; given values are
#    normalised to upper case
$tsvmod_flag = $tsvmod_flag ? uc($tsvmod_flag) : "ON";

# --- Get Program name
my $subrname = GetSubrName();

# -- Check mandatory options
unless ( $seqfile && -e $seqfile ) {
    die "${subrname}__E> Cannot proceed without a file of sequences\n";
}

# -- Jobs go to the SGE cluster unless told otherwise
$runwhere = 'SGE' unless ( $runwhere );

# -- --disks and --nodes are required for SGE runs and rejected for
#    everything else
if ( $runwhere ne 'SGE' ) {
    if ( $disks || $nodes ) {
        warn "${subrname}__E> --disks and --nodes do not work with " .
             "--run_where='LOCAL'\n";
        die  "${subrname}__E>   Try --help for usage help\n";
    }
}
elsif ( !$disks || !$nodes ) {
    warn "${subrname}__E> Missing mandatory options: --disks and/or --nodes\n";
    die  "${subrname}__E>   Try --help for usage help\n";
}

# -- Set the default values

# --modpipeclusterbase names the ModPipe base directory as seen from the
# cluster; the script to run is main/ModPipe.pl below it.  Without the
# option, the script of this installation is used.
if ($modpipe) {
   $modpipe = "${modpipe}/main/ModPipe.pl";
} else {
   $modpipe = GetModPipeScript("main/ModPipe.pl");
}

# Model selection schemes are passed on as one comma-separated string
# with all whitespace removed.
push @final_modby, 'LONGEST_DOPE' unless ( @final_modby );
my $final_modby_str = join ",", @final_modby;
$final_modby_str =~ s/\s//g;

$exitstage  = 'NONE' unless ( $exitstage );
$hitsmode   = '111'  unless ( $hitsmode );
# NOTE: an explicit --evalue_hits 0 is also replaced by the default here.
$evaluehits = 1.0    unless ( $evaluehits );
$runname    = "ModWeb20b-" . time unless ( $runname );
$cleanup    = 'ON'   unless ( $cleanup );
if ($template_conf) {
  $template_conf = File::Spec->rel2abs($template_conf);
} else {
  # Abuse of GetModPipeScript...
  $template_conf = GetModPipeScript("web/conf/template_modpipe.conf");
}

# An explicit --tmpdir wins; otherwise the run name is appended to
# --tmpdirroot, or to the built-in scratch location.
if (!$tmpdir) {
    if (!$tmpdirroot) {
        $tmpdir     = "/scratch/ModWeb20b/${runname}";
    } else {
        $tmpdir="$tmpdirroot/${runname}";
    }
}

# All directories are made absolute because the script changes into the
# run directory later on.
$rundir     = cwd() . "/${runname}" unless ( $rundir );
$tmpdir = File::Spec->rel2abs($tmpdir);
$rundir = File::Spec->rel2abs($rundir);
$datdir     = "${rundir}/data" unless ( $datdir );
$datdir = File::Spec->rel2abs($datdir);

$logdir   = "${rundir}/sge-logs" unless ( $logdir );
$logdir = File::Spec->rel2abs($logdir);
$jobname  = 'Modweb20' unless ( $jobname );
# Test for definedness so that an explicit "--priority 0" is honoured
# rather than replaced by the default.
$priority = -4 unless ( defined $priority );
$sgefile  = 'sge-modpipe.csh' unless ( $sgefile );

$modfile = "${runname}.mod" unless ( $modfile );
$hitfile = "${runname}.hit" unless ( $hitfile );
$finfile = "${runname}.fin" unless ( $finfile );

# -- Create the run directory
die "${subrname}__E> Failed making run directory: $rundir\n"
  unless ( CheckDir( $rundir ) );

# -- Create the data directory
die "${subrname}__E> Failed making data directory: $datdir\n"
  unless ( CheckDir( $datdir ) );

# --- Check if log directory exists
die "${subrname}__E> Could not create log directory: $logdir\n"
   unless ( CheckDir( $logdir ) );

# -- Print Time
printf "%s\n", GetTime();

# -- Re-assign seqfile name
my $newseqfile = "${rundir}/" . basename($seqfile);

# -- Copy the sequence file into rundir 
#    only if it does not exist already
unless ( -e $newseqfile ){
  unless( CopyFile($seqfile, $newseqfile) ){
    warn "${subrname}__E> Could not copy input sequences into run directory\n";
    warn "${subrname}__E> Run Directory: $rundir\n";
    die  "${subrname}__E> Sequence File: $seqfile\n";
  }
}

# -- Move into rundir
#    Everything below uses file names relative to the run directory, so
#    carrying on after a failed chdir would write into the wrong place.
chdir($rundir) or
   die "${subrname}__E> Could not change into run directory: $rundir: $!\n";

# -- Re-assign sequence filename
$seqfile = basename($newseqfile);

# -- Open the configuration file
my $conffile = "modpipe.conf";
my $fh_conf = OpenNewFile( $conffile ) or
   die "${subrname}__E> Could not open file for configuration\n";

# -- Read the template ModPipe config file
my $template_hash = '';
die "${subrname}__E> Failed reading template ModPipe configuration\n"
   unless ( $template_hash = ReadConf($template_conf) );

# -- Override some entries in the template config file
$template_hash->{'DATDIR'} = $datdir;
$template_hash->{'TMPDIR'} = $tmpdir;

# -- Write the configuration file
WriteConf($fh_conf, $template_hash);
close($fh_conf);

# -- Add sequence(s) to repository
  # -- Options handed to AddSeqMP
  my %addseqopt = (
      "--conf_file"     => $conffile,
      "--sequence_file" => $seqfile,
  );

  # -- Build the AddSeqMP command line; its STDERR is folded into STDOUT
  my $command = GetModPipeScript("main/AddSeqMP.py")
                . ' ' . join(' ', %addseqopt) . ' 2>&1';

  # -- Run Command (anything but a zero status counts as failure)
  unless ( system( $command ) == 0 ) {
     warn "${subrname}__E> Failed adding sequences to repository\n";
     warn "${subrname}__E> $command\n";
     die  "${subrname}__E> ... Will exit\n";
  }

  # -- The unq file is named after the sequence file, with everything
  #    from the first dot onwards replaced by '.unq'
  my $unqfile = fileparse($seqfile, '\..*');
  $unqfile .= '.unq';
  unless ( -e $unqfile ) {
     die "${subrname}__E> Could not find file with unique MD5 ids: $unqfile\n";
  }

  # -- Read in the MD5 ids
  my ($ids, $names) = ReadUNQ( $unqfile );
  printf("%s %8d\n", "${subrname}__M> No. of sequences added: ",
         scalar(@{ $ids }));

# -- Submit job to the queue
  if ( $runwhere =~ /\bSGE\b/ ){
    # -- Create the SGE job file
    my $sgefh = OpenNewFile($sgefile) or
      die "${subrname}__E> Could not open SGE job file: $sgefile\n";

    # -- Write SGE job file
    WriteSGEMP($sgefh, $logdir, $jobname, $modpipe, $disks, $nodes, $priority,
               $conffile, $exitstage, $hitsmode, $evaluehits, $cleanup, $tsvmod_flag, $final_modby_str,
               scalar(@$ids), $ids,$tmpdir);

    # -- Close the SGE job file handle
    close($sgefh);
  
    # -- Verify if qsub is in the path
    my $qsubexe = `which qsub 2> /dev/null` or
      die "${subrname}__E> Cannot find qsub in path\n";
    chomp $qsubexe;

    # -- Submit job and collect jobid
    #    The id is taken from the third word of the qsub output; anything
    #    from the first dot on (e.g. a task range) is dropped.
    warn "${subrname}__M> Submitting job to cluster\n";
    my $qsubout = `qsub $sgefile`;
    $qsubout = '' unless ( defined $qsubout );
    chomp $qsubout;
    my $jobid = (split(" ", $qsubout))[2];
    $jobid =~ s/\..*// if ( defined $jobid );

    # -- Without a usable job id the qstat loop below would never end and
    #    the END block would qdel garbage, so stop right here.
    unless ( defined $jobid && $jobid =~ /^\d+$/ ) {
      warn "${subrname}__E> Could not get a job id from qsub\n";
      die  "${subrname}__E> qsub output: $qsubout\n";
    }
    $running_sge_jobid = $jobid;
    warn "${subrname}__M> Job successfully submitted: $jobid\n";

    # -- wait for job completion and collect results
    while ( 1 ){
      # -- Sleep for a while
      sleep 120; 

      # -- Check status of job; qstat reports finished jobs as unknown
      my $qstatout = `qstat -j $jobid 2>&1`;
      last if ($qstatout =~ /^Following jobs do not exist/);
    }
    # Ensure that END block does not try to qdel the finished job
    $running_sge_jobid = undef;

    # -- Post process data
    process_job_output($subrname, $logdir, $jobid);
  } else {
      # -- Run the job locally

      # -- Form the options hash for ModPipe
      my %mpopt = (
                    "--conf_file"       => $conffile,
                    "--hits_mode"       => $hitsmode,
                    "--exit_stage"      => $exitstage,
                    "--evalue_hits"     => $evaluehits,
                    "--score_by_tsvmod"     => $tsvmod_flag,
                    "--clean_up"        => $cleanup,
                    "--final_models_by"   => $final_modby_str,
                  );

      # -- Run the sequences one at a time
      foreach my $oneid ( @$ids ){
         # -- Fix the sequence id
         $mpopt{"--sequence_id"} = $oneid;
         $command = GetModPipeScript("main/ModPipe.pl")
                       . " @{[ %mpopt ]} 2>&1";

         # -- Run the command and echo its output.  A sequence that cannot
         #    be started is reported and skipped; the others still run.
         my $runfh;
         unless ( open($runfh, '-|', $command) ) {
           warn "${subrname}__W> Could not run ModPipe for sequence $oneid: $!\n";
           warn "${subrname}__W> $command\n";
           next;
         }
         while ( my $line = <$runfh> ) {
           print $line;
         }
         # -- close() on a pipe is false when the command exited non-zero
         close($runfh) or
           warn "${subrname}__W> ModPipe did not finish cleanly for sequence $oneid\n";
      }
  }

   # --- Gather the models: GatherModMP receives the same comma-separated
   #     --final_models_by string that was built for ModPipe.

  # -- Options handed to GatherModMP
  my %collectmod = (
      "--conf_file"       => $conffile,
      "--unq_file"        => $unqfile,
      "--output_modfile"  => $modfile,
      "--output_hitfile"  => $hitfile,
      "--output_finfile"  => $finfile,
      "--final_models_by" => $final_modby_str,
  );

  # -- Build the GatherModMP command line; its STDERR is folded into STDOUT
  $command = GetModPipeScript("main/GatherModMP.py")
             . ' ' . join(' ', %collectmod) . ' 2>&1';

  # -- Run Command (anything but a zero status counts as failure)
  unless ( system( $command ) == 0 ) {
     warn "${subrname}__E> Failed to collect models from repository\n";
     warn "${subrname}__E> $command\n";
     die  "${subrname}__E> ... Will exit\n";
  }


# -- Print Time
printf("%s\n", GetTime());

#--- exit
exit 0;


END {
  # If an SGE job was submitted and is still running, delete it
  if (defined $running_sge_jobid) {
    # Inside END, $? holds the status the script is about to exit with and
    # system() overwrites it.  Localise it so that a successful qdel does
    # not turn a failed or interrupted run into exit status 0.
    local $?;
    system("qdel $running_sge_jobid");
  }
}


# -- Subroutines

# --- Usage
# Print the help text to STDOUT.  Takes no arguments; the caller decides
# whether to exit afterwards.  The defaults quoted here mirror the values
# assigned in the "Set the default values" section of the script.
sub usage {
print <<EOF;

${0}:

ModWeb is essentially a script that will take a file with sequences
in FASTA format and run ModPipe on them. It will use the SGE queueing
system to submit the job to the cluster. So it is essential that this
script is run on a machine that is a SGE submit host.

Options:

      --version                 Report version number of this program.
      --help                    This help. Pipe it through 'more' if it
                                scrolls off the screen.
      --sequence_file           Input file with sequences in FASTA 
                                format.  Mandatory option.
      --runname                 A runname for the ModWeb run. 
                                Default: ModWeb20b-(time). A directory of this name
                                will be created in the current working path.
      --run_directory           The directory from where the jobs should be
                                run. All data will be written into this
                                directory.
                                Default: [PWD]/<runname>
      --output_models_file      Filename to store the data about the models
                                produced. Typically for loading into ModBase.
                                Default: <runname>.mod
      --output_hits_file        Filename to store the data about the hits
                                produced. Typically to figure how many hits were
                                found.  Default: <runname>.hit
      --final_models_by         Scheme to choose the final models by. Possible
                                values: LONGEST_DOPE, LONGEST_GA341, SEQID,
                                GA341, DOPE, MPQS, TSVMOD, ALL.  Multiple options can
                                be specified by multiple copies of the command
                                line switch.  For example, "--final_models_by
                                LONGEST_DOPE --final_models_by SEQID" will
                                return two models.  Default: LONGEST_DOPE
      --output_final_models     Filename to store the models that pass the
                                model selection (see above).  Default: <runname>.fin
      --run_where               Where to run the job. Values are SGE or LOCAL.
                                The LOCAL is present mainly for testing purposes.
                                Default: SGE

MODPIPE Configuration Options:
  These options will be used to create the ModPipe configuration file.
  See example file doc/modpipe.conf for definitions.

      --template_conf           Template configuration file.
                                Default: web/conf/template_modpipe.conf
      --tmpdirroot              TMPDIR Root: The root directory for the temporary 
                                directory on the nodes. Will add <runname> to this
                                Default: /scratch/ModWeb20b/
      --tmpdir                  TMPDIR: Temporary directory on the nodes. 
                                Default: /scratch/ModWeb20b/<runname>
                                This option overwrites --tmpdirroot
      --datdir                  DATDIR: Data directory to store results.
                                Default: <run_directory>/data

SGE Options:
  These options are used to specify the SGE job.

      --log_directory           Directory for storing the logs from SGE tasks.
                                Will be created if it does not exist.
                                Default: <run_directory>/sge-logs
      --job_name                Name for your job. Default: Modweb20
      --modpipeclusterbase      The modpipe base accessible from the cluster.          
                                Will default to the base in this ModPipe installation.
      --disks                   The names of the disk-complexes defined in SGE.
                                Specify as a string enclosed within double quotes.
                                Mandatory option when --run_where is SGE.
      --nodes                   The names of the node-complexes defined in SGE.
                                Specify as a string enclosed within double quotes.
                                Mandatory option when --run_where is SGE.
      --priority                Priority for running the tasks.  Default: -4
      --sge_file                Output filename to write the SGE job script.
                                Default: sge-modpipe.csh

MODPIPE Options:
  These are options that you would normally specify on the command-line of
  ModPipe.pl.

      --exit_stage              Choose the exit stage for the program. You can
                                quit after one of the following: PROFILE,
                                ALIGNMENTS, MODELS.  Default: NONE
      --hits_mode               Mode for calculating template hits. It is a
                                three-letter code containing 1 (true) or 0
                                (false) for each of Seq-Seq, Prf-Seq and
                                Prf-Prf, respectively. For instance, to
                                calculate Seq-Seq and Prf-Prf, set it to 101.
                                Default: 111
      --score_by_tsvmod         [ON],OFF (in case tsvmod is not installed)
      --evalue_hits             E-value threshold to get hits against template 
                                databases. This value controls hits from all 
                                three searches.  Default: 1.0
      --clean_up                Flag to clean up the temporary directory
                                after all operations. [ON], OFF        
EOF
}

# Check for error messages in the SGE job output
#
# Arguments: program name used as message prefix, the SGE log directory,
#            and the id of the finished job.
# Dies if no log file is found for the job, if a log file cannot be opened,
# or at the first log line containing the "__E>" error marker.  Otherwise
# prints a completion message.
sub process_job_output {
  my ($subrname, $logdir, $jobid) = @_;

  my @logs = get_sge_logfiles($logdir, $jobid);
  if (scalar(@logs) == 0) {
    die "${subrname}__E> Could not find any logfiles in $logdir for job $jobid";
  }
  for my $log (@logs) {
    # Three-argument open with a lexical handle: the path is only ever
    # treated as a file name to read, whatever characters it contains.
    open(my $logfh, '<', $log)
      or die "${subrname}__E> Cannot open logfile $log: $!";
    while (my $line = <$logfh>) {
      if ($line =~ /__E>/) {
        die "${subrname}__E> Job $jobid reported an error in $log: $line";
      }
    }
    close($logfh);
  }

  print "${subrname}__M> Job $jobid completed.\n";
}

# Get all SGE log files in the given directory for the given job
#
# Returns the sorted list of "$logdir/NAME" paths whose NAME contains
# ".o<jobid>." and does not start with a dot, i.e. what the shell pattern
# "*.o<jobid>.*" selects.  Returns an empty list if the directory cannot
# be read.
sub get_sge_logfiles {
  my ($logdir, $jobid) = @_;

  # The directory is read directly rather than globbed: glob() would split
  # the pattern at whitespace in $logdir and expand any wildcard or brace
  # characters it contains.
  opendir(my $dirh, $logdir) or return;
  my @names = grep { !/^\./ && /\.o\Q$jobid\E\./s } readdir($dirh);
  closedir($dirh);

  return map { "$logdir/$_" } sort @names;
}
