# This file is part of ModPipe, Copyright 1997-2010 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

package PLLib::Sequence;
require Exporter;
@ISA    = qw(Exporter);
@EXPORT = qw( ParseSeq ParseFASTA ParsePIR
              ReadNextSeq ReadOneFASTA ReadOnePIR
              CleanSeq VerifySeqFormat 
              WritePIR WriteFASTA 
              CheckAlignCode IsCodeInFASTA IsCodeInPIR
              IsCodeInPROFILE TrimAlignment ExtractSequences
              AliDir FindSeqInSeq RemoveDashes AtomFilesPIR
              StructureCodes Res3ToRes1 FormatSeq GetCov
              IsGAP IsSTDAA CompareSeq GetPercGaps
              GetOverlap IsBRK ReadOneSPTR ParseSPTR parse_residue_number
             );
             

use strict;
use PLLib::Utils;

sub ParseSPTR {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($seq, $clean) = @_;

   # --- Split the sequence into separate lines
   my @seqlines = split("\n", $seq);

   # --- Initialize the varibles
   my ( $idcode, $prottyp, $pdbfile, $pdbbeg,
        $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
        $pdbsrc, $pdbresol, $pdbrfact, $sequence ) = undef;

   # --- Check if PIR sequence & parse
   my $line;
   if ( VerifySeqFormat($seq, "sptr") ){
      foreach $line (@seqlines){
         chomp $line;
         # --- Get the align_code
         if ( $line =~ /^AC\s{3}\w+;/ ){
            $line =~ s/;//;
            $line = (split(" ", $line))[1];
            $idcode = $line;
         } elsif ( $line =~ /^\s{5}/ ){
              $line =~ s/\s//g;
              $sequence .= $line;
           }
      }

      # --- Clean up the sequence
      $sequence = CleanSeq($sequence) if ( $clean =~ /ON/i );
   }

   # --- Return fields
   return ( $idcode, $prottyp, $pdbfile, $pdbbeg,
            $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
            $pdbsrc, $pdbresol, $pdbrfact, $sequence );
}


sub ReadOneSPTR {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   local *FH_SPTR = $_[0];
   local $/ = "//\n";

   # --- Read the next seq
   my $seq = undef;
   return (defined($seq = <FH_SPTR>)) ? $seq : undef;
}

sub IsBRK {

   # -- Get subroutine name
   my $subname = GetSubrName();

   # -- Check arguments
   my $nargs = 1;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   # -- Get input arguments
   my $res = $_[0];

   # -- Return if not a single character
   return if ( length($res) != 1 );

   # -- Check if input is a break character
   return 1 if ( $res =~ m/[\/]/ );

   # -- Return null 
   return;
}
sub GetOverlap {

   # -- Get subroutine name
   my $subname = GetSubrName();

   # -- Check arguments
   my $nargs = 4;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   # -- Get input arguments
   my ( $refbeg, $refend, $beg, $end ) = @_;

   # -- Get the extent of overlap
   my ($ovlp_beg, $ovlp_end) = 0;
   $ovlp_beg = ( $beg >= $refbeg ) ? $beg : $refbeg;
   $ovlp_end = ( $end <= $refend ) ? $end : $refend;

   # -- Calculate the actual overlap
   my $ovlp = 0;
   $ovlp = $ovlp_end - $ovlp_beg + 1 ;

   # -- Calculate the percentage overlap
   my $pcovlp = 0;
   $pcovlp = 100*$ovlp/($end - $beg + 1);

   # -- Get the extent of the non-overlapping region
   my $nonovlp = 0;
   $nonovlp += ($refbeg - $beg + 1) if ( $beg < $refbeg );
   $nonovlp += ($end - $refend + 1) if ( $end > $refend );

   # -- Calculate the percentage non-overlap
   my $pcnonovlp = 0;
   $pcnonovlp = 100*$nonovlp/($end - $beg + 1);
   
   # -- Return
   return ($ovlp, $pcovlp, $nonovlp, $pcnonovlp, $ovlp_beg, $ovlp_end);
}



sub GetPercGaps {

   # -- Get subroutine name
   my $subname = GetSubrName();

   # -- Check arguments
   my $nargs = 3;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   # -- Get input arguments
   my ( $alifile, $aliformat, $refcode ) = @_;

   # -- Return if alignment not found
   return unless ( -e $alifile );

   # -- Open alifile handle
   my $alifh = OpenFile( $alifile );

   # -- Start reading in the sequences
   my @sequences = (); my $refseq = undef;
   while ( defined(my $seq = ReadNextSeq($alifh, $aliformat)) ){

      # -- Parse alignment unit to get sequence and alicode
      my ($idcode, $sequence) = (ParseSeq($seq, $aliformat,'OFF'))[0,11];

      if( $idcode eq $refcode ){
         $refseq = $sequence;
      } else {
           push(@sequences, $sequence);
        }
   }
   close( $alifh );

   # -- Now you have all the sequences from the alignment in an array
   #    Run through it to calculate numbers
   my ($alipos, $gappos, $idpos) = 0; my @gapratios = ();
   foreach my $seq ( @sequences ){
      unless (( $alipos, $gappos, $idpos ) = CompareSeq( $refseq, $seq )){
         warn "${subname}__E> Sequence comparison failed\n";
         next;
      }

      # -- Calculate ratio of gapped pos to aligned pos
      my $gapratio = 100*$gappos/$alipos;

      # -- Store values in an array
      push @gapratios, $gapratio;
   }

   # -- Sort the array of gapratios (ascending)
   @gapratios = sort { $a <=> $b } @gapratios;

   # --  Get the lowest value
   my $gap_perc = shift @gapratios;

   # -- Return
   return $gap_perc;
}

sub CompareSeq {

   # -- Get subroutine name
   my $subname = GetSubrName();

   # -- Check arguments
   my $nargs = 2;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   # -- Get input arguments
   my ( $refseq, $myseq ) = @_;

   # -- Return if no sequence 
   return unless ( length($refseq) > 0 && length($myseq) > 0 );

   # -- Return if lengths are different
   return unless ( length($refseq) == length($myseq) );

   # -- Get numbers
   my ($alipos, $gappos, $idpos) = 0;
   for( my $i = 0; $i < length($refseq); $i++ ){

      # -- Get residue types
      my $res1 = substr($refseq, $i, 1);
      my $res2 = substr($myseq, $i, 1);

      # -- Increment aligned position
      $alipos++ if ( IsSTDAA($res1) && IsSTDAA($res2) );

      # -- Skip if both positions are gaps
      next if ( IsGAP($res1) && IsGAP($res2) );

      # -- Increment gap positions
      $gappos++ if ( IsGAP($res1) || IsGAP($res2) );

      # -- Increment identities
      $idpos++ if ( IsSTDAA($res1) && IsSTDAA($res2) && $res1 eq $res2 );
   }

   # -- Return values
   return $alipos, $gappos, $idpos;
}


sub IsSTDAA {

   # -- Get subroutine name
   my $subname = GetSubrName();

   # -- Check arguments
   my $nargs = 1;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   # -- Get input arguments
   my $res = $_[0];

   # -- Return if not a single character
   return if ( length($res) != 1 );

   # -- Check if input is a AA residue
   return 1 if ( $res =~ m/[ACDEFGHIKLMNPQRSTVWY]/ );

   # -- Return null 
   return;
}

sub IsGAP {

   # -- Get subroutine name
   my $subname = GetSubrName();

   # -- Check arguments
   my $nargs = 1;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   # -- Get input arguments
   my $res = $_[0];

   # -- Return if not a single character
   return if ( length($res) != 1 );

   # -- Check if input is a gap character
   return 1 if ( $res =~ m/[-]/ );

   # -- Return null 
   return;
}




sub GetCov {

   # -- Get subroutine name
   my $subname = GetSubrName();

   # -- Check arguments
   my $nargs = 4;

   die "${subname}__E> Insufficient arguments\n"
     unless (scalar(@_) == $nargs);

   # -- Get input arguments
   my ( $refbeg, $refend, $beg, $end ) = @_;

   # -- Get the extent of coverage
   my ($cov_beg, $cov_end) = 0;
   $cov_beg = ( $beg >= $refbeg ) ? $beg : $refbeg;
   $cov_end = ( $end <= $refend ) ? $end : $refend;

   # -- Calculate the actual coverage
   my $cov = 0;
   $cov = $cov_end - $cov_beg + 1 ;

   # -- Calculate the percentage coverage
   my $pccov = 0;
   $pccov = 100*$cov/($refend - $refbeg + 1);
   
   # -- Return
   return ($cov, $pccov);
}

sub FormatSeq {

   #--- Get subroutine name
   my $subname = GetSubrName();

   # -- check arguments
   my $nargs = 14;

   unless (scalar(@_) == $nargs){
     print "${subname}__D> Insufficient arguments\n" ;
     return;
   }

   # -- assign arguments
   my ($align_code, $prottype, $pdbcode, 
       $pdbstart, $pdbchn1, $pdbstop, $pdbchn2,
       $pdbnam, $pdbsrc, $pdbresol, $pdbrfact,
       $seq, $cols, $outfmt) = @_;

   # -- Format according to request
   my $fmtseq = undef;
   if ( $outfmt =~ /^FASTA$/i ){
     $fmtseq = WriteFASTA($align_code, $prottype, $pdbcode, 
                          $pdbstart, $pdbchn1, $pdbstop, $pdbchn2,
                          $pdbnam, $pdbsrc, $pdbresol, $pdbrfact,
                          $seq, $cols);
   } else {
       $fmtseq = WritePIR($align_code, $prottype, $pdbcode, 
                          $pdbstart, $pdbchn1, $pdbstop, $pdbchn2,
                          $pdbnam, $pdbsrc, $pdbresol, $pdbrfact,
                          $seq, $cols);
     }

  # -- Return formatted sequence
  return ( $fmtseq );
}


sub Res3ToRes1 {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 1;

   unless (scalar(@_) == $nargs){
      print "${subname}__E> Insufficient arguments\n" ;
      return;
   }

   #--- hash for conversion
   my %r31 = ( "ALA" => "A", "CYS" => "C", "ASP" => "D", "GLU" => "E", "PHE" => "F",
               "GLY" => "G", "HIS" => "H", "HSD" => "H", "ILE" => "I", "LYS" => "K",
               "LEU" => "L", "MET" => "M", "ASN" => "N", "PRO" => "P", "GLN" => "Q",
               "ARG" => "R", "SER" => "S", "THR" => "T", "VAL" => "V", "TRP" => "W",
               "TYR" => "Y", "CSH" => "C", "PR0" => "P", "PRZ" => "P", "UNK" => "X",
               "ASX" => "B", "GLX" => "Z", "CSS" => "C", "CYX" => "C", "MSE" => "M"
             );


   #-- reassign the variables
   my $res3 = $_[0];

   #--- check length of input
   if ( length($res3) != 3 ) {
      print "$subname __D> Input is not a 3-letter code: $res3\n" ;
      return;
   }

   #--- convert input to upper case
   $res3 =~ tr/a-z/A-Z/;

   #--- do the actual conversion
   my $res1 = "";
   $res1 = $r31{$res3};

   #--- Assign 'X' to any unknown residue
   $res1 = 'X' if ($res1 eq "");

   return $res1;
}



sub StructureCodes{

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 1;

   unless (scalar(@_) == $nargs){
     print "${subname}__D> Insufficient arguments\n" ;
     return;
   }

   #--- Assign input variables
   my $pirfile = $_[0];

   # --- Check for the existence of the file
   unless ( -e $pirfile ){
      warn "${subname}__E> Could not find PIR file: $pirfile\n";
      return;
   }

   # --- Open PIR file
   my $fh_pir = OpenFile( $pirfile );

   # --- Read the file and get the structure (align) codes
   my @strcodes = ();
   while ( defined(my $seq = ReadNextSeq($fh_pir, 'PIR')) ){
   
      # --- Verify that the sequence is in PIR format
      unless ( VerifySeqFormat($seq, "pir") ){
         warn "${subname}__E> Sequence not in PIR format\n";
         warn "               Alignment File: $pirfile\n";
         return;
      }

      # --- Parse the sequence to get info
      my ($strcode, $prottyp) = (ParseSeq($seq, 'PIR', 'OFF'))[0,1];
   
      # --- Copy the atomfile record
      push(@strcodes, $strcode) if ( $prottyp =~ /^structure/i );
   }

   # --- Close the file
   close($fh_pir);

   # --- Return the values
   return @strcodes;
}



sub AtomFilesPIR{

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 1;

   unless (scalar(@_) == $nargs){
     print "$subname __D> Insufficient arguments\n" ;
     return;
   }

   #--- Assign input variables
   my $pirfile = $_[0];

   # --- Check for the existence of the file
   unless ( -e $pirfile ){
      warn "${subname}__E> Could not find PIR file: $pirfile\n";
      return;
   }

   # --- Open PIR file
   my $fh_pir = OpenFile( $pirfile );

   # --- Read the file and get the atom_file codes
   my @atomfiles = ();
   while ( defined(my $seq = ReadNextSeq($fh_pir, 'PIR')) ){
   
      # --- Parse the sequence to get info
      my ($prottyp, $atomfile) = (ParseSeq($seq, 'PIR', 'OFF'))[1,2];
   
      # --- Copy the atomfile record
      push(@atomfiles, $atomfile) if ( $prottyp =~ /^structure/i );
   }

   # --- Close the file
   close($fh_pir);

   # --- Return the values
   return @atomfiles;
}

sub RemoveDashes {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 1;

   unless (scalar(@_) == $nargs){
     print "$subname __D> Insufficient arguments\n" ;
     return;
   }

   #--- Assign input variables
   my $seq = $_[0];

   #--- remove all dashes from sequence
   $seq =~ s/\-//g;

   #--- Return array
   return $seq;
}


sub FindSeqInSeq {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 2;

   unless (scalar(@_) == $nargs){
      print "$subname __E> Insufficient arguments\n" ;
      return;
   }

   #--- reassign input variables
   my ($subseq, $seq) = @_;

   #--- run through the seq2 to find all matching seq1
   my $subseq_len = length($subseq);
   my $seq_len    = length($seq);

   # --- fetch the bounds
   my ($start, $stop);
   $start = index($seq, $subseq);
   $stop  = $start + $subseq_len - 1;

   # -- Check if indexing suceeded
   return if ( $start < 0 );

   # --- Fix the ends
   $start = $start + 1;
   $stop  = $stop  + 1;

   # --- Return bounds
   return($start, $stop);
}


sub AliDir {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return (1, undef);
   }

   # --- Reassign input arguments
   my $sequence = $_[0];

   # --- Create sequence dir string
   my $subdir = substr($sequence, 0, 2);
   my $dir    = "$subdir/$sequence";

   # --- Return value
   return $dir;

}

sub TrimAlignment {

   #--- Get subroutine name
   my $subname = GetSubrName();

   #--- Check arguments
   my $nargs = 2;

   unless (scalar(@_) == $nargs){
      print "$subname __D> Insufficient arguments\n" ;
      return;
   }

   #--- reassign input variables
   my ($target, $template) = @_;

   #--- Check lengths
   if ( length($target) ne length($template) ){
      print "$subname __E> Target/Template don't match in length\n";
      return;
   }

   #--- Initialize variables
   my ( $template_beg, $template_end, $offset ) = undef;
   my ( $i ) = undef;

   #-- Get the beginning position of template
   $template_beg = 0;
   for($i = 0; $i < length($template); $i++ ){
      last if ( substr($template, $i, 1) ne '-' );
      $template_beg++;
   }

   #-- Get the ending position of template
   $template_end = length($template);
   for($i = length($template); $i > 0; $i--){
      $offset = $i - length($template) - 1;
      last if ( substr($template, $offset, 1) ne '-' );
      $template_end--;
   }

   $template = substr($template, $template_beg, $template_end - $template_beg );
   $target   = substr($target  , $template_beg, $template_end - $template_beg );

   #--- Check lengths
   if ( length($target) ne length($template) ){
      print "$subname __E> Trimmed target/template don't match in length\n";
      return;
   }

   #--- Remove aligned gaps
   my ($target2, $template2) = undef;
   for($i = 0; $i < length($target); $i++){
      if ( substr($target, $i, 1) ne '-' || substr($template, $i, 1) ne '-' ){
         $target2   .= substr($target, $i, 1);
         $template2 .= substr($template, $i, 1);
      }
   }

   return ($target2, $template2);
}


sub IsCodeInPROFILE {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($fh, $code) = @_;
   local *FH_PRF = $fh;

   # --- Check for code
   my $found = undef;
   while ( my $ln = <FH_PRF> ){
      next if ( $ln =~ /^#/ );
      if ( (split(" ", $ln))[1] eq $code ){
         $found = 1;
         last;
      }
   }

   # --- Return value
   return $found;
}

sub IsCodeInPIR {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($fh, $code) = @_;
   local *FH_PIR = $fh;

   # --- Check for code
   my $found = undef;
   while ( my $ln = <FH_PIR> ){
      next if ( $ln !~ /^>P1;/ );
      chomp $ln;
      $ln =~ s/^>P1;//;
      if ( (split(" ", $ln))[0] eq $code ){
         $found = 1;
         last;
      }
   }

   # --- Return value
   return $found;
}


sub IsCodeInFASTA {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($fh, $code) = @_;
   local *FH_FASTA = $fh;

   # --- Check for code
   my $found = undef;                                                          
   while ( my $ln = <FH_FASTA> ){                                                
      next if ( $ln !~ /^>/ );
      chomp $ln;
      $ln =~ s/^>//;
      if ( (split(" ", $ln))[0] eq $code ){
         $found = 1;
         last;
      }
   }                                                                           

   # --- Return value
   return $found;
}


sub CheckAlignCode {

   #--- Get subroutine name
   my $subname = GetSubrName();

   # -- check arguments
   my $nargs = 3;

   unless (scalar(@_) == $nargs){
     print "${subname}__D> Insufficient arguments\n" ;
     return;
   }

   # -- Reassign input arguments
   my ( $seqfile, $seqformat, $seqcode ) = @_;

   # -- Check for file
   return unless ( -e $seqfile );

   # -- Check for the sequence code
   my $result = undef;
   my $fh_seq = IO::File->new("< $seqfile");
   if ( $seqformat =~ /FASTA/i ){
      $result = IsCodeInFASTA($fh_seq, $seqcode);
   } elsif ( $seqformat =~ /^PIR$/i ){
        $result = IsCodeInPIR($fh_seq, $seqcode);
     }
     elsif ( $seqformat =~ /^PROFILE$/i ){
        $result = IsCodeInPROFILE($fh_seq, $seqcode);
     }
   close($fh_seq);

   # -- Return value
   return $result;
}

sub WriteFASTA {

   #--- Get subroutine name
   my $subname = GetSubrName();

   # -- check arguments
   my $nargs = 13;

   unless (scalar(@_) == $nargs){
     print "${subname}__D> Insufficient arguments\n" ;
     return;
   }

   # -- assign arguments
   my ($align_code, $prottype, $pdbcode, 
       $pdbstart, $pdbchn1, $pdbstop, $pdbchn2,
       $pdbnam, $pdbsrc, $pdbresol, $pdbrfact,
       $seq, $cols) = @_;
   my ($nline, $n, $outseq);

   undef $outseq; my $lastnewline = 0;
   $lastnewline = 1 if ( length($seq)/$cols == int(length($seq)/$cols) );

   $outseq  = ">$align_code\n";

   $nline = int(length($seq)/$cols);
   for $n ( 1 .. $nline ) {
      $outseq .= substr($seq,($n-1)*$cols,$cols) . "\n";
   }

   $outseq .= substr($seq,$nline*$cols) . "\n" if ( $lastnewline == 0 );

   return ($outseq);
}



sub WritePIR {

   #--- Get subroutine name
   my $subname = GetSubrName();

   # -- check arguments
   my $nargs = 13;

   unless (scalar(@_) == $nargs){
     print "${subname}__D> Insufficient arguments\n" ;
     return;
   }

   # -- assign arguments
   my ($align_code, $prottype, $pdbcode, 
       $pdbstart, $pdbchn1, $pdbstop, $pdbchn2,
       $pdbnam, $pdbsrc, $pdbresol, $pdbrfact,
       $seq, $cols) = @_;
   my ($nline, $n, $outseq);

   undef $outseq; my $lastnewline = 0;
   $lastnewline = 1 if ( length($seq)/$cols == int(length($seq)/$cols) );

   $outseq  = ">P1;$align_code\n";
   $outseq .= "$prottype:$pdbcode:$pdbstart:$pdbchn1:$pdbstop:$pdbchn2:";
   $outseq .= "$pdbnam:$pdbsrc:$pdbresol:$pdbrfact\n";

   $nline = int(length($seq)/$cols);
   for $n ( 1 .. $nline ) {
      $outseq .= substr($seq,($n-1)*$cols,$cols) . "\n";
   }

   $outseq .= substr($seq,$nline*$cols) . "\n" if ( $lastnewline == 0 );
   $outseq .= "*" . "\n";

   return ($outseq);
}

sub ParseSeq {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 3;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($seq, $inpformat, $clean) = @_;

   # --- Initialize the varibles
   my ( $idcode, $prottyp, $pdbfile, $pdbbeg,
        $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
        $pdbsrc, $pdbresol, $pdbrfact, $sequence ) = undef;

   # --- Parse the sequence
   if ( $inpformat =~ /fasta/i ){
      ( $idcode, $prottyp, $pdbfile, $pdbbeg,
        $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
        $pdbsrc, $pdbresol, $pdbrfact, $sequence ) = ParseFASTA($seq, $clean);
   } elsif ( $inpformat =~ /pir/i ){
        ( $idcode, $prottyp, $pdbfile, $pdbbeg,
          $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
          $pdbsrc, $pdbresol, $pdbrfact, $sequence ) = ParsePIR($seq, $clean);
     }
     elsif ( $inpformat =~ /sptr/i ){
        ( $idcode, $prottyp, $pdbfile, $pdbbeg,
          $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
          $pdbsrc, $pdbresol, $pdbrfact, $sequence ) = ParseSPTR($seq, $clean);
     }

   # --- Return values
   return ( $idcode, $prottyp, $pdbfile, $pdbbeg,
            $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
            $pdbsrc, $pdbresol, $pdbrfact, $sequence );
}

sub ParseFASTA {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($seq, $clean) = @_;

   # --- Split the sequence into separate lines
   my @seqlines = split("\n", $seq);

   # --- Initialize the varibles
   my ( $idcode, $prottyp, $pdbfile, $pdbbeg,
        $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
        $pdbsrc, $pdbresol, $pdbrfact, $sequence ) = undef;

   my $line;
   if ( VerifySeqFormat($seq, "fasta") ){
      foreach $line (@seqlines){
         chomp $line;
         # --- Get the align_code
         if ( $line =~ /^>/ ){
            $line =~ s/^>//;
            $line = (split(" ", $line))[0];
            $idcode = $line;
         } else {
              $sequence .= $line;
           }
      } 

      # --- Clean up the sequence
      $sequence = CleanSeq($sequence) if ( $clean =~ /ON/i );
   } else {
        die "${subname}__E> Sequence not in FASTA format\n";
     }

   # --- Return fields
   return ( $idcode, $prottyp, $pdbfile, $pdbbeg,
            $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
            $pdbsrc, $pdbresol, $pdbrfact, $sequence );
}


sub ParsePIR {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($seq, $clean) = @_;

   # --- Split the sequence into separate lines
   my @seqlines = split("\n", $seq);

   # --- Initialize the varibles
   my ( $idcode, $prottyp, $pdbfile, $pdbbeg,
        $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
        $pdbsrc, $pdbresol, $pdbrfact, $sequence ) = undef;

   # --- Check if PIR sequence & parse
   my $line;
   if ( VerifySeqFormat($seq, "pir") ){
      foreach $line (@seqlines){
         chomp $line;
         # --- Get the align_code
         if ( $line =~ /^>P1;/ ){
            $line =~ s/\>P1;//;
            $line = (split(" ", $line))[0];
            $idcode = $line;
         } elsif ( $line =~ /^structure/i || $line =~ /^sequence/i ){
              ($prottyp, $pdbfile, $pdbbeg,
               $pdbchn1, $pdbend, $pdbchn2,
               $pdbnam, $pdbsrc, $pdbresol,
               $pdbrfact) = split(/:/, $line);
           }
           else {
              $sequence .= $line;
           }
      }

      # -- Remove trailing * from the PIR sequence
      $sequence =~ s/\*$//;

      # --- Clean up the sequence
      $sequence = CleanSeq($sequence) if ( $clean =~ /ON/i );
   }

   # --- Return fields
   return ( $idcode, $prottyp, $pdbfile, $pdbbeg,
            $pdbchn1, $pdbend, $pdbchn2, $pdbnam, 
            $pdbsrc, $pdbresol, $pdbrfact, $sequence );
}

sub VerifySeqFormat {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($sequence, $inpformat) = @_;

   # --- Split sequence lines
   my @seqlines = split("\n", $sequence);

   # --- Verify format
   my ( $pircnt, $fsacnt, $sptrcnt );
   $pircnt = $fsacnt = $sptrcnt = 0;
   foreach my $ln ( @seqlines ){
      $fsacnt++ if ($ln =~ /^>[A-Z0-9a-z]/ && $ln !~ /^>P1;/ );
      $pircnt++ if ($ln =~ /^>P1;/ );
   }

   # Do special matching of SP/Tr sequences
   my ( $id, $ac, $sq, $sl) = 0;
   foreach my $ln ( @seqlines ){
      $id++ if ( $ln =~ /^ID/ );
      $ac++ if ( $ln =~ /^AC/ );
      $sq++ if ( $ln =~ /^SQ   SEQUENCE/ );
      $sl++ if ( $ln =~ /^\/\//);
   }
   $sptrcnt++ if ( $id > 0 && $ac > 0 && $sq > 0 && $sl > 0 );

   # --- Return values
   return if(( $fsacnt > 0 && $pircnt > 0 ) ||
             ( $fsacnt > 0 && $sptrcnt > 0 ) ||
             ( $pircnt > 0 && $sptrcnt > 0 ));

   return $fsacnt if ( $inpformat =~ /fasta/i );
   return $pircnt if ( $inpformat =~ /pir/i );
   return $sptrcnt if ( $inpformat =~ /sptr/i );
}


sub CleanSeq {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my $sequence = $_[0];

   # --- Clean up sequence
   $sequence =~ s/\n//mg;    # --- make one line
   $sequence =~ s/\r//mg;    # --- make one line
   $sequence =~ s/\W+//g;    # --- eliminate non-word char
   $sequence =~ s/_+//g;     # --- eliminate underscore, since not covered by \W
   $sequence =~ s/\d+//g;    # --- eliminate numbers
   $sequence =~ s/\*//;      # --- eliminate *'s
   $sequence =~ s/\s+//g;    # --- eliminate spaces
   $sequence =~ tr/a-z/A-Z/; # --- convert to uppercase
   $sequence =~ s/B/N/g;     # --- convert ASX to ASN
   $sequence =~ s/Z/Q/g;     # --- convert GLX to GLN
   $sequence =~ s/[^ACDEFGHIKLMNPQRSTVWY]/G/g; # --- convert everything else to GLY

   return $sequence;
}

sub ReadNextSeq {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 2;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   my ($fh_seq, $seqformat) = @_;

   # --- Fetch the sequence, appropriately
   my $seq = undef;
   if ( $seqformat =~ /PIR/i ){
      $seq = ReadOnePIR($fh_seq);
   }
      elsif ( $seqformat =~ /\bSPTR\b/i ){
        $seq = ReadOneSPTR($fh_seq);
      }
      else {
        $seq = ReadOneFASTA($fh_seq);
      }

   # --- Return sequence
   return($seq);
}

sub ReadOnePIR {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   local *FH_PIR = $_[0];
   local $/ = "\n>";

   # --- Read the next seq
   my $seq = undef;
   while ( defined($seq = <FH_PIR>) ){
      $seq =~ s/\nC;.*\n/\n/g;      # - Clear comment lines if any
      if ( $seq =~ /^P1;/ || 
           $seq =~ /^>P1;/ ){	          # - Handle the first sequence
         $seq =~ s/^>//;                  # - Just to handle the first sequence
         $seq =~ s/^/>/;                  # - Restore the begining character
         $seq =~ s/\*\s*\n*\>.*\Z/\*\n/s; # - Trailing nonsense
         last ;
      }
   }

   # --- Return Sequence
   return($seq);
}

sub ReadOneFASTA {

   # --- Get subroutine name
   my $subname = GetSubrName();

   # --- Check arguments
   my $nargs = 1;

   unless ( scalar(@_) == $nargs ){
      print "${subname}__D> Insufficient arguments\n";
      return;
   }

   # --- Reassign input arguments
   local *FH_FSA = $_[0];
   local $/ = "\n>";

   # --- Read the next seq
   my $seq = undef;
   while ( defined($seq = <FH_FSA>) ){
      $seq =~ s/\nC;.*\n/\n/;        # - Clear comment lines if any
      if ( ($seq =~ /^[0-9A-Za-z]/ || $seq =~ /^>[0-9A-Za-z]/ ) && 
            $seq !~ /^P1;/ ){
         $seq =~ s/^>//;         # - Handle the first sequence
         $seq =~ s/^/>/;         # - Restore the begining character
         $seq =~ s/\n*\>\Z/\n/s; # - Trailing nonsense
         last;
      }
   }

   # --- Return Sequence
   return($seq);
}

# Convert a residue number to our preferred internal form. Use an integer
# if possible; otherwise (e.g. it contains an insertion code) use a string.
sub parse_residue_number {
  my ($r) = @_;
  if ($r + 0 eq $r) {
    return $r + 0;
  } else {
    $r =~ s/^\s+//;
    $r =~ s/\s+$//;
    return $r;
  }
}

# Given a database of sequences in PIR or FASTA format, generate a new file
# containing only the sequence codes specified.
sub ExtractSequences {
  my ($seqdb, $outfile, $format, $listcodes) = @_;
  my $subrname = GetSubrName();

  my $outfh = OpenNewFile($outfile);

  # -- Convert list codes to an index hash for faster access
  my %codes = ();
  foreach my $lcode (@$listcodes) {
    $codes{$lcode} = 1;
  }

  # -- Start processing the file
  my $seqcount = 0;
  my $dbfh = OpenFile($seqdb) or
      die "${subrname}__E> Failed opening file: $seqdb";
  while (defined(my $seq = ReadNextSeq($dbfh, $format))) {
    # -- Get the sequence code
    my $scode = (ParseSeq($seq, $format, 'OFF'))[0];

    # -- Print out the sequence if requested
    if ($codes{$scode}) {
      $seqcount++;
      print $outfh $seq;
    }

    # -- Stop if all requested sequences have been processed
    last if ($seqcount == @$listcodes);
  }
  close ($dbfh);
  close ($outfh);
}
