# This file is part of ModPipe, Copyright 1997-2010 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

package PLLib::PDBUtils;
require Exporter;
@ISA    = qw(Exporter);
@EXPORT = qw( CheckPDBFile VerifyChnInPDB GetPDBAtomRec
              GetChnIds RWPDBAtomRec GetChnAtoms GetChnSeqIdx
              GetAtoms GetPDBResNum GetPDBData
              GetHetWaterCount GetPDBRepository);


use strict;
use PLLib::Utils;

sub GetPDBRepository {
  my ($include_local) = @_;

  # New-style PDB repository stucture (1abc in an 'ab' directory, gzipped)
  my $rep = '/netapp/database/pdb/remediated/pdb/';

  if ($include_local) {
    return "./:${rep}";
  } else {
    return $rep;
  }
}

sub GetHetWaterCount {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 1;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   my ($fname) = @_;

   my $fh = OpenFile($fname);

   #--- Read through filehandle
   my $pdbln;
   my $curr_het = '';
   my $curr_wat = '';
   my $watcnt = 0;
   my $hetcnt = 0;
   while ($pdbln = <$fh>){
      chomp $pdbln;
      if ( substr($pdbln,0,6) eq 'HETATM' && 
           substr($pdbln,17,10) ne $curr_het &&
           substr($pdbln,17,3) ne 'HOH'){
         $hetcnt++;
         $curr_het = substr($pdbln,17,10);
      } elsif ( substr($pdbln,0,6) eq 'HETATM' &&
                substr($pdbln,17,10) ne $curr_wat &&
                substr($pdbln,17,3) eq 'HOH'){
         $watcnt++;
         $curr_wat = substr($pdbln,17,10);
      }
   }
   close($fh);

   return($hetcnt, $watcnt);
}

sub GetPDBData {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 2;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   # -- Get arguments
   my ($pdb, $pdbrep) = @_;

   # -- Check for the existence of the PDB file
   my $pdbf = undef;
   unless ($pdbf = CheckPDBFile($pdb, $pdbrep)){
      warn "${subname}__E> Could not find PDB file   : $pdb\n";
      warn "${subname}__E> Specified PDB repositories: $pdbrep\n";
      return;
   }

   # -- Get the PDB lines
   my $atomrecs = GetPDBAtomRec($pdbf);

   # -- Get the number of chains
   my $chnids = GetChnIds($atomrecs);
   my $numchains = scalar($chnids);

   # -- Get the number of waters, and number of HETATM residues (non-water)
   my ($hetcnt, $watcnt) = GetHetWaterCount($pdbf);

   # -- Return stuff
   return ($numchains, $hetcnt, $watcnt);
}

sub GetPDBResNum {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 4;

   unless (scalar(@_) == $nargs){
      print "${subname}__E> Insufficient arguments\n" ;
      return;
   }

   #--- Get the input arguments
   my ($seq, $pdb, $chn, $pdbrep) = @_;

   # -- Check for the existence of the PDB file
   my $pdbf = undef;
   unless ($pdbf = CheckPDBFile($pdb, $pdbrep)){
      warn "${subname}__E> Could not find PDB file   : $pdb\n";
      warn "${subname}__E> Specified PDB repositories: $pdbrep\n";
      return;
   }

   # -- Extract sequence from PDB
   my ( $aaseq, $aaidx) = undef;
   unless ( ( $aaseq, $aaidx ) = GetChnSeqIdx( $pdbf, $chn )){
     warn "${subname}__E> Failed extracting PDB sequence: \"$chn\", $pdb\n";
     return;
   }

   # -- Get the beg/end of supplied sequence
   my ( $beg, $end );
   my $seq_copy = RemoveDashes( $seq );
   unless ( ($beg, $end) = FindSeqInSeq($seq_copy, $aaseq) ){
     warn "${subname}__E> Sequence not found in PDB: \"$chn\", $pdb\n";
     return;
   }

   # -- Return
   return($aaidx->[$beg - 1], $aaidx->[$end - 1]);
}

sub CheckPDBFile {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 2;

   unless (scalar(@_) == $nargs){
      print "${subname}__E> Insufficient arguments\n" ;
      return;
   }

   #--- Get the input argument
   my ($pdbf, $pdbrep) = @_;

   # -- Split the pdbred variable to indiv. paths
   my @pdbdirs = split(/:/, $pdbrep);

   # Handle PDB-style subdirectories (e.g. 1abc in 'ab' subdirectory)
   my $subdir = substr($pdbf, 1, 2);

   # Check PDB directories, extensions
   my @ext=('', '.gz', '.bz2', '.Z');
   foreach my $pdbdir (@pdbdirs) {
     foreach my $ext (@ext) {
       my @pdbnames=("${pdbf}${ext}","${pdbf}.pdb${ext}","pdb${pdbf}.ent${ext}");
       foreach my $pdbname (@pdbnames) {
         if (-e "${pdbdir}/${pdbname}") {
           return ("${pdbdir}/${pdbname}");
         } elsif (-e "${pdbdir}/${subdir}/${pdbname}") {
           my $return="${pdbdir}/${subdir}/${pdbname}";           
           return $return;
         } 
       }
     }
   }
   # File not found - return undef
}

sub VerifyChnInPDB {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 2;

   die "${subname}__E> Insufficient arguments\n" 
     unless (scalar(@_) == $nargs);

   #--- Get the input arguments
   my ($pdbf, $chn) = @_;

   #--- Fetch the lines containing ^ATOM records
   my $atomxyz = undef;
   unless ($atomxyz = GetPDBAtomRec($pdbf)){
     warn "${subname}__E> Failed reading coordinates: $pdbf\n";
     return;
   }

   #--- Get the number of chains
   my $chn_ids = undef;
   unless( $chn_ids = GetChnIds($atomxyz)){
     warn "${subname}__E> Failed geting chain identifiers: $pdbf\n";
     return;
   }
   my $numbchns = scalar(@$chn_ids);

   # -- Check if the specified chain id exists in the file
   #    return appropriate value
   if ( grep { /${chn}/ } @$chn_ids ){ return 1 } else { return };
}

sub GetPDBAtomRec {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 1;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);
   my ($fname) = @_;

   my $fh = OpenFile($fname);

   #--- Read through filehandle
   my $pdbln; my @atomrecs = ();
   while ($pdbln = <$fh>){
      chomp $pdbln;
      push(@atomrecs, $pdbln) 
        if ($pdbln =~ /^ATOM/ || 
            $pdbln =~ /^TER/ || 
            $pdbln =~ /^ENDMDL/ || 
            $pdbln =~ /^END/ ||
            # -- Brain dead processing for PDB
            # -- This is for Selenomethianine modification
            ($pdbln =~ /^HETATM/ && substr($pdbln, 17, 3) eq 'MSE') ||
            ($pdbln =~ /^HETATM/ && substr($pdbln, 17, 3) eq 'MET')
           );
   }
   close($fh);

   #--- return array
   return(\@atomrecs);
}

sub GetChnIds {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 1;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   #--- reassign input variables
   my $atomrecs = $_[0];

   #--- run through the coordinates and analyze
   my $curr_chn_id = ''; my $ln;
   my @chn_ids = ();
   foreach $ln (@$atomrecs){
      next if($ln =~ /^TER/ || $ln =~ /^ENDMDL/ || $ln =~ /^END/);
      if (substr($ln, 21, 1) ne $curr_chn_id) {
         $curr_chn_id = substr($ln, 21, 1) 
           if (substr($ln, 21, 1) ne $curr_chn_id);
         push(@chn_ids, $curr_chn_id) 
           if ( ! grep { /$curr_chn_id/ } @chn_ids );
      }
   }

   return(\@chn_ids);
}

sub RWPDBAtomRec {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 3;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   #--- reassign input variables
   my ( $orgpdb, $chn, $fh_newpdb ) = @_;

   #--- Fetch the lines containing ^ATOM records
   my $atomxyz = GetPDBAtomRec($orgpdb)
     or die "${subname}__E> Failed reading coordinates: $orgpdb\n";

   # -- Get a specific chain if specified
   if ( $chn !~ /^all/i ){

     # -- Get the atom records for this chain
     my $chnxyz = GetChnAtoms($atomxyz, $chn) or
       die "${subname}__E> Failed getting atoms for chain: \"$chn\" in $orgpdb\n";

     # -- Reassign the atomxyz pointer to that of chain
     undef $atomxyz;
     $atomxyz = $chnxyz;
   }

   # -- Write the ATOM records into new file
   foreach my $rec ( @$atomxyz ){
     printf $fh_newpdb "$rec\n";
   }

   # -- Return
   return 1;
}

sub GetChnAtoms {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 2;

   unless (scalar(@_) == $nargs){
      print "${subname}__E> Insufficient arguments\n" ;
      return ;
   }

   #--- reassign input variables
   my ($atomrecs, $chnid) = @_;

   #--- run through the coordinates and analyze
   my $ln; my $curr_res = '';my @chnrecs = ();
   foreach $ln (@$atomrecs){
      if (substr($ln, 21, 1) eq $chnid) {
         push(@chnrecs, $ln);
         last if( $ln =~ /^TER/ || $ln =~ /^ENDMDL/ || $ln =~ /^END/ );
      }
   }

   # -- Check if chain exists
   return if ( scalar(@chnrecs) < 1 );

   # -- Return chain records
   return(\@chnrecs);
}


sub GetChnSeqIdx {

  use PLLib::Sequence;

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 2;

   die "${subname}__E> Insufficient arguments\n" 
     unless (scalar(@_) == $nargs);

   #--- Get the input arguments
   my ($pdbf, $chn) = @_;

   #--- Fetch the lines containing ^ATOM records
   my $atomxyz = undef;
   unless( $atomxyz = GetPDBAtomRec($pdbf) ){
     warn "${subname}__E> Failed reading coordinates: $pdbf\n";
     return;
   }

   # -- Get the records of the specified chain
   my $chnrecs = undef;
   unless( $chnrecs = GetChnAtoms($atomxyz, $chn)){
     warn "${subname}__E> Failed getting coordinates for chain: \"$chn\" in $pdbf\n";
     return;
   }

   #--- run through the coordinates and analyze
   my $refresn = ' '; 
   my $aaseq = undef; my @aanum = ();
   foreach my $ln (@$chnrecs){
      my $resn = substr($ln, 22, 5);
      if ( $resn ne $refresn ){
        my $res1 = Res3ToRes1(substr($ln, 17, 3));
        $aaseq .= $res1;
        push @aanum, substr($ln, 22, 5);
        $refresn = $resn;
      }
   }

   # -- Return
   return ($aaseq, \@aanum);
}

sub GetAtoms {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 2;

   unless (scalar(@_) == $nargs){
      print "${subname}__E> Insufficient arguments\n" ;
      return;
   }

   #--- reassign input variables
   my ($atomrecs, $attype) = @_;

   #--- run through the coordinates and analyze
   my $ln; my $refln = ' '; my @atrecs = ();
   foreach $ln (@$atomrecs){
      if (substr($ln, 13, 2) eq $attype && 
          (substr($ln, 16, 1) eq ' ' || substr($ln, 16, 1) eq 'A') &&
          substr($ln,17,10) ne substr($refln,17,10)) {
         push(@atrecs, $ln);
         $refln = $ln if ($ln ne $refln);
      }
   }

   return(\@atrecs);
}

