#!/usr/bin/perl
#use strict;
use DBI; 		# MySQL database interface
use Getopt::Long; 	# Parse arguments

#---- Initial variables
my $label   = "CLS_PATCHES__>";
my $mysqldb1= "PDBc";
my $mysqldb2= "PDBe";
my $mysqlrm = "eva.compbio.ucsf.edu";
my $mysqlus = "dbali";
my $password= "dbalisecret";

#---- Get Options
GetOptions ( "l=s" => \$patlst,
             "p=f" => \$mp,
             "i=f" => \$mi,
             "s=f" => \$ms,
             "o=s" => \$output,
             "dbg=i" => \$dbg,
             );

$options  = "$label Options:\n";
$options .= "$label \t-l\t= File with a list of patches to cluster [*]\n";
$options .= "$label \t-p\t= Minimal P-value fraction [0.5]\n";
$options .= "$label \t-i\t= Minimal identity [50]\n";
$options .= "$label \t-s\t= Minimal coverage of the patch [75%]\n";
$options .= "$label \t-o\t= Output file with clusters [list.grp]\n";
$options .= "$label \t-dbg\t= Debugging [0]\n";

#---- Get options and set defaults
unless ($patlst){ print $options; exit 0; }
unless ($mp)    { $mp = 0.5; }
unless ($ms)    { $ms =  75; }
unless ($mi)    { $mi =  50; }
unless ($output){ $output =  $patlst.".grp"; }
unless ($dbg)   { $dbg=   0; }

#---- Print usage and options selected
$usage  = "$label Usage:\n";
$usage .= "$label \t-l\t= $patlst\n";
$usage .= "$label \t-p\t= $mp\n";
$usage .= "$label \t-i\t= $mi\n";
$usage .= "$label \t-s\t= $ms\n";
$usage .= "$label \t-o\t= $output\n";
$usage .= "$label \t-dbg\t= $dbg\n";
$usage .= "$label\n";
print $usage;

#($rc,$rp,$rr) = &FromPatchFile($patlst);
($rc,$rp,$pres) = &FromPatchFile($patlst);
@chains   = @$rc; $tchains = $#chains+1;
@patches  = @$rp; $tpatches = $#patches+1;
print "$label A total of $tpatches patches for $tchains chains have been read...\n";

#---- Connect to MySQL database PDBc
my $dsn1 = "DBI:mysql:".$mysqldb1.":".$mysqlrm;
my $dbh1 = DBI->connect($dsn1, $mysqlus, $password, %mysql_err) or print "$main::label Cannot connect to database. Error $DBI::err\n";
my $dsn2 = "DBI:mysql:".$mysqldb2.":".$mysqlrm;
my $dbh2 = DBI->connect($dsn2, $mysqlus, $password, %mysql_err) or print "$main::label Cannot connect to database. Error $DBI::err\n";

#---- Get Similar chains to chains with patches.
$nc = 0;
foreach $c (@chains) {
   $nc++;
   if (length($c) == 4) { $c = $c."_"; }
   $chain_table = "pdb".$c;
   #---- Get sequence
   ($rseq,@{"res".$c}) = &GetSeq($c);
   #---- Get Inner Pvalue
   $query  = "SELECT FORMAT(Pvalue,2) FROM $chain_table WHERE ChainB = '$c'";
   my @res = &QueryData($dbh1,$query);
   if ($mp > 1) { 
      $pvalue{$c} = $res[0]; $mpvalue{$c} = $mp; 
   } else {
      $pvalue{$c} = $res[0]; $mpvalue{$c} = $pvalue{$c}*$mp; if ($mpvalue{$c} < 3.5) { $mpvalue{$c} = 3.5;  }
   } 
   #---- Get similar chains
   $query  = "SELECT ChainB, Eqv FROM $chain_table WHERE Pvalue >= $mpvalue{$c} order by Pvalue desc";
   my @simchains = &QueryData($dbh2,$query);
   $tsimchains = $#simchains+1;
   print "$label Chain $c has an inner pvalue of $pvalue{$c} [$mpvalue{$c}] and $tsimchains similar chains -$nc/$tchains-\n";
   $link{$c}{$c} = 1; #inner match
   foreach $sc (@simchains) { 
      ($simchain,@eqvs) = split(/\s+/,$sc); 
      if ($dbg >= 3) { print "$label Chain $c is similar to $simchain [@eqvs]\n"; }
      $link{$c}{$simchain} = 1;
      $link{$simchain}{$c} = 1;
   }
}

#---- Run all-against-all patches and calculate which ones can be clustered and to which ones can be extended.
#-- UnClustered patch
foreach $p (@patches) { $g{$p} = 0; }
#-- Start comparison of all against all
@clusters = ();
for $i (0..$#chains) {
   $c1 = $chains[$i];
   print "$label Clustring patches in chain $c1... $i/$tchains\n";
   for $j ($i+1..$#chains) {
#   for $j (0..$#chains) {
#      if ($i == $j) { next; }
      $c2 = $chains[$j];
      #-- Are the chains similar? Sequence identity?
      if ($link{$c1}{$c2}) {
         if ($dbg) { print "$label Chain |$c1| vs |$c2| are linked...\n"; }
         @ali1 = split(/\s+/,(&QueryData($dbh2,"SELECT Eqv FROM pdb$c1 WHERE ChainB = '$c2'"))[0]);
         @ali2 = split(/\s+/,(&QueryData($dbh2,"SELECT Eqv FROM pdb$c2 WHERE ChainB = '$c1'"))[0]);
         if ($dbg) { print "$label Chain $c1 --> [@ali1]\n"; }
         if ($dbg) { print "$label Chain $c2 --> [@ali2]\n"; }
         %eqvs = {}; 
         for $k (0..$#ali1) { $a1 = $ali1[$k]; $a2 = $ali2[$k]; $eqvs{$a1}{$a2} = 1; }
         #-- Run all-against-all patches comparison for the two chains
         foreach my $p1 (keys %{$pres->{$c1}}) {
            if ($g{$p1}) { next; }
            push(@clusters,$p1); 
            foreach my $p2 (keys %{$pres->{$c2}}) {
               if ($g{$p2}) { next; }
#print "$c1 [$p1] vs $c2 [$p2] [$g{$p1}/$g{$p2}] --> ";
               $sid = 0; $cov = 0; $min = 0;
               @res1 = split(/,/,$pres->{$c1}{$p1}); 
               @res2 = split(/,/,$pres->{$c2}{$p2});
               if ($#res1 <= $#res2) { $min = $#res1+1; } else { $min = $#res2+1; }
               for $k (0..$#res1) { 
                  $a1 = $res1[$k]; 
                  for $kk (0..$#res1) { 
                     $a2 = $res2[$kk]; 
                     # coverage and sequence identity of the coverage
                     if ($eqvs{$a1}{$a2}) { 
                        $cov++; 
                        # seqid
                        if ($rseq->{$c1}->{$a1} eq $rseq->{$c2}->{$a2}) { $sid++; }
                        #print "$c1 -> $a1 = $rseq->{$c1}->{$a1}\n";
                        #print "$c2 -> $a2 = $rseq->{$c2}->{$a2} ----> $sid\n";
                     }
                  }
               }
               $cov = 100*$cov/$min;
               $sid = 100*$sid/$min;
               if ($dbg >= 2) {
                  print "$label Comparing $p1 [@res1]\n";
                  print "$label           $p2 [@res2]\n";
                  print "$label CV: $cov\n";
                  print "$label ID: $sid\n\n";
               }
#print "Cov: $cov and Sid: $sid\n";
               if ($cov >= $ms and $sid >= $mi) {
#print "SIMILAR $p1 <-- $p2 with $cov coverage and $sid identity\n";
                  $g{$p2} = 1;
                  push(@{$p1},$p2);
               }
            }
         }
      }
   }
}
#---- Final clusters
#-- Add orphan patches
for $i (0..$#chains) {
   $c = $chains[$i];
   foreach my $p (keys %{$pres->{$c}}) {
      if (!$g{$p}) { push(@clusters,$p); }
   }
}
@clusters = &Unique(@clusters);
my $tclusters = $#clusters+1;

#---- Print clusters
print "$label Writting output...\n";
open(OUT,">$output");
print OUT "# Input file:            $patlst\n";
print OUT "# Structural similarity: $mp\n";
print OUT "# Structural coverage:   $ms\n";
print OUT "# Sequencial identity:   $mi\n";
print OUT "# Initial chains:        $tchains\n";
print OUT "# Initial patches:       $tpatches\n";
print OUT "# Final clusters:        $tclusters\n";
foreach $p (@clusters) {
   print OUT "$p\t@{$p}\n";
}
close(OUT);

#---- Disconnect from the database
$dbh1->disconnect ();
$dbh2->disconnect ();
exit;
#####################

#End
exit;

sub by_num  { $a <=> $b }
sub by_numr { $b <=> $a }

sub QueryData {
   my $dbh = shift;
   my $q   = shift;
   my @results = ();
   my @data = ();
   $sth = $dbh->prepare("$q") or print "$main::label Cannot prepare query. Error $DBI::err\n";
   $sth->execute () or print "$main::label Cannot execute query. Error $DBI::err Query: [$q]\n";
   while (@results = $sth->fetchrow_array ()) {
      push(@data,join(" ",@results));
   }
   $sth->finish ();
   return(@data);
}
1;

sub FromPatchFile {
   my $file = shift;
   my ($c,$nu,$a,$b,@p,%r,@cs);
   open(INP,"<$file");
   while (<INP>) {
      chomp;
      ($a,$b) = split(/\s+/,$_); 
      $b =~ s/[A-Z]//g; # clean characters in the interger part of the PDB residue number
      ($c,$nu) = split(/:/,$a);
      push(@p,$a); 
      push(@cs,$c); 
      $r->{$c}{$a} = $b;
   }
   close(INP);
   @cs = &Unique(@cs);
   return(\@cs,\@p,$r);
}
1;

sub Unique {
   @in = @_;
   undef %saw;
   @saw{@in} = ();
   @out = sort by_num keys %saw;
   return(@out);
}
1;

sub ArrayUniIntDiff {
   my $ra = shift; @a = @$ra;
   my $rb = shift; @b = @$rb;
   my ($e,%count,@union,@isect,@diff);

   foreach $e (@a, @b) { $count{$e}++ }
   foreach $e (keys %count) {
      push(@union, $e);
      if ($count{$e} == 2) {
         push @isect, $e;
      } else {
         push @diff, $e;
      }
   }
   # Return
   return(\@union,\@isect,\@diff);
}
1;

sub GetSeq {
   my $chain = shift;
   my $db    = "/diva2/home/marcius/DBAli.v2/chains/";
   my $file  = $db."/".substr($chain,0,2)."/".$chain.".seq";
   my %seq = {}; my @res = ();
   open(F,"<$file");
   while (<F>) {
      chomp;
      my ($nu,$n,$a) = split(/\s+/,$_);
      $seq->{$chain}->{$n} = $a;
      push(@res,$n);
   }
   close(F);
   return($seq,@res);
}
1;

