 #!/usr/bin/perl

use strict;
use FileHandle;
use DBI;

#todo
#determine error cases
#if uniprot has multiple GIs, see which one has best modbase map (or maybe just print out everything to see if there is a big difference)
#determine how to represent accessions that were lost because mapping wasn't available
#figure out which accessions could not be mapped; make sure I am doing everything correctly in previous steps.  Currently 15 out of 66 couldn't be mapped.
#incorporate TC map
#make sure that it is definitely the case that all GI accessions were mapped to SMPs


#old input files
#mapping tables, each handed to one of the load*Map subs below
my $globalModbaseMapFile = "/trombone1/home/dbarkan/Schisto/Mapping/modbaseGlobal/data/smpModbaseMap.txt";
my $wilsonToSmpFile = "/trombone1/home/dbarkan/Schisto/Mapping/wilsonToSmp/data/wilsonToSmpMap.txt";
my $gbToGiFile = "/trombone1/home/dbarkan/Schisto/Mapping/gbAndUpToGi/data/gbToGiMap.tab";
my $tcToSmpFile = "/trombone1/home/dbarkan/Schisto/Mapping/tigrToSmp/data/tigrToSmpMap.txt";
my $giToSmpMapFile = "/trombone1/home/dbarkan/Schisto/Mapping/smpToGi/data/smpToGiMap.txt";

#accession lists, each handed to one of the load*Accessions subs below
my $curwenAccessionFile = "/trombone1/home/dbarkan/Schisto/Filters/Cercaria/data/Curwen06/Curwen06Data.txt";
my $knudsenLipidAccessionFile = "/trombone1/home/dbarkan/Schisto/Filters/Cercaria/data/Knudsen05/knudsen_lipid.txt";
my $knudsenNoInductionAccessionFile = "/trombone1/home/dbarkan/Schisto/Filters/Cercaria/data/Knudsen05/knudsen_no_induction.txt";
my $knudsenAcetabularAccessionFile = "/trombone1/home/dbarkan/Schisto/Filters/Cercaria/data/Knudsen05/knudsen_acetabular.txt";
my $hansellAccessionsFile = "/trombone1/home/dbarkan/Schisto/Filters/Cercaria/data/Hansell08/hansellData.txt";


#output file
#opened (and therefore truncated) before any input is read; the OS error is included
#in the message so a permissions or missing-directory problem can be told apart
my $filterOutputFile = "/trombone1/home/dbarkan/Schisto/Filters/Cercaria/data/cercariaFilter_new.txt";
my $filterFh = FileHandle->new(">" . $filterOutputFile) || die "could not open $filterOutputFile for writing: $!\n";



#column names, written as a '#'-prefixed header line ahead of the filter rows
#NOTE(review): 10 names are listed here but printFilterLine emits 7 tab-separated fields per row
#(no sequence identity / coverage / % unaligned values) -- confirm which side is intended
my @columnHeaders = ("Modbase Sequence Id", "Source Database Name", "Source Database Accession Id", "Mapped SMP Id", "Mapping Sequence Identity", "Mapping Coverage", "Mapping % Unaligned SMP", "Notes", "Data Sources", "Protein Name");

#counters shared with the map*Accession subs.  This has to be a hash ref from the start:
#the subs copy their arguments, so an undef passed in would be autovivified only inside
#the sub and every failure counted before the first top-level increment would be dropped
my $statistics = {};

#load mapping (each table is read once)
my $globalModbaseMap = &loadGlobalModbaseMap($globalModbaseMapFile);
my $giToSmpMap = &loadGiToSmpMap($giToSmpMapFile);
my $gbToGiMap = &loadGbToGiMap($gbToGiFile);
my $wilsonToSmpMap = &loadWilsonToSmpMap($wilsonToSmpFile);
my $tcToSmpMap = &loadTcToSmpMap($tcToSmpFile);


#load Datasets
#every loader hands back $map->{sourceDb}->{accession} = proteinName
my $curwenAccessions             = loadCurwenAccessions($curwenAccessionFile);
my $knudsenLipidAccessions       = loadKnudsenAccessions($knudsenLipidAccessionFile);
my $knudsenNoInductionAccessions = loadKnudsenAccessions($knudsenNoInductionAccessionFile);
my $knudsenAcetabularAccessions  = loadKnudsenAccessions($knudsenAcetabularAccessionFile);
#my $hansellAccessions = &loadHansellAccessions($hansellAccessionsFile);

#merge all loaded accessions into the $allData hash, remembering which data source each came from
#$allData looks like $allData->{sourceDb}->{accessionNumber}->{dataSourceName} = proteinName
my $allData;

foreach my $dataSet (
    [ "curwen06",           $curwenAccessions ],
    [ "knudsenLipid",       $knudsenLipidAccessions ],
    [ "knudsenNoInduction", $knudsenNoInductionAccessions ],
    [ "knudsenAcetabular",  $knudsenAcetabularAccessions ],
) {
    my ($dataSourceName, $loadedAccessions) = @$dataSet;
    $allData = addAccessionsToFilter($allData, $loadedAccessions, $dataSourceName);
}
#$allData = &addAccessionsToFilter($allData, $hansellAccessions, "hansell");

print STDERR "Mapping Genbank accessions from all data sources and writing to filter at $filterOutputFile\n";

#filter rows are collected in this string and written out after the header lines
my $filterString = "";

#GB Accessions: GB --> GI --> SMP --> Modbase
my $gbAccessions = $allData->{"gb"};
foreach my $gbAccession (keys %$gbAccessions){
    my ($modbaseSeqId, $smpId) = mapGbAccession($gbAccession, $globalModbaseMap, $giToSmpMap, $gbToGiMap, $statistics);

    #unmapped accessions are tallied inside mapGbAccession; nothing is written for them
    next unless ($modbaseSeqId);

    #value stored per accession is the {dataSourceName => proteinName} hash; notes column is "na"
    $filterString .= printFilterLine("gb", $modbaseSeqId, $gbAccession, $smpId, "na", $gbAccessions->{$gbAccession}, $filterFh); #output
    $statistics->{gbAccessionCount}++;
}

#UP accessions: UP --> Modbase
print STDERR "Mapping uniprot accessions from all data sources and writing to filter at $filterOutputFile\n";
my $upAccessions = $allData->{"up"};  #uniprot
foreach my $upAccession (keys %$upAccessions){
    my ($modbaseSeqId, $smpId) = mapUpAccession($upAccession, $globalModbaseMap, $statistics);

    #unmapped accessions are tallied and logged inside mapUpAccession
    next unless ($modbaseSeqId);

    $filterString .= printFilterLine("up", $modbaseSeqId, $upAccession, $smpId, "na", $upAccessions->{$upAccession}, $filterFh);
    $statistics->{upAccessionCount}++;
}

#TC Accessions: TC --> SMP --> modbase
my $tcAccessions = $allData->{"tc"};
foreach my $tcAccession (keys %$tcAccessions){
    my ($modbaseSeqId, $smpId, $notesString) = mapTcAccession($tcAccession, $globalModbaseMap, $tcToSmpMap, $statistics);

    #mapTcAccession returns a false sequence id (and counts the failure) when either hop is missing
    next unless ($modbaseSeqId);

    #unlike the other sources, the notes column carries the extra fields from the TC map line
    $filterString .= printFilterLine("tc", $modbaseSeqId, $tcAccession, $smpId, $notesString, $tcAccessions->{$tcAccession}, $filterFh);
    $statistics->{tcAccessionCount}++;
}


#SN Accessions: SN --> SMP --> modbase
#NOTE(review): rows produced here are labelled source "smp" with accession "n/a", so the
#original SN accession does not appear in the output -- confirm that this is intended
my $snAccessions = $allData->{"sn"};
foreach my $snAccession (keys %$snAccessions){

    #first hop: SN id to SMP id
    my $smpAccession = $wilsonToSmpMap->{$snAccession};
    unless ($smpAccession){
        print STDERR "could not map to smp id from snap accession $snAccession\n";
        $statistics->{noSmpForSnCount}++;
        next;
    }

    #second hop: SMP id to modbase sequence id (shares its failure counter with the direct SMP loop)
    my $modbaseSeqId = $globalModbaseMap->{$smpAccession};
    unless ($modbaseSeqId){
        print STDERR "could not map to modbase sequence id from smp $smpAccession (mapped from sn accession $snAccession)\n";
        $statistics->{noModbaseForSmpCount}++;
        next;
    }

    $filterString .= printFilterLine("smp", $modbaseSeqId, "n/a", $smpAccession, "na", $snAccessions->{$snAccession}, $filterFh);
    $statistics->{snCount}++;
}
    
#SMP Accessions: SMP --> modbase
my $smpAccessions = $allData->{"smp"};  #smp
foreach my $smpAccession (keys %$smpAccessions){
    my $modbaseSeqId = $globalModbaseMap->{$smpAccession};

    unless ($modbaseSeqId){
        print STDERR "could not map to modbase sequence id from smp $smpAccession\n";
        $statistics->{noModbaseForSmpCount}++;
        next;
    }

    #the SMP id goes in the "mapped SMP" column; there is no separate source accession, hence "n/a"
    $filterString .= printFilterLine("smp", $modbaseSeqId, "n/a", $smpAccession, "na", $smpAccessions->{$smpAccession}, $filterFh);
    $statistics->{smpCount}++;
}


#finalize, print stats

#a counter that was never incremented does not exist in the hash; report it as 0
#instead of interpolating an empty string into the stats lines
#NOTE(review): tcAccessionCount is tallied by the TC loop but is not reported below
my $gbCount = $statistics->{gbAccessionCount} || 0;
my $upCount = $statistics->{upAccessionCount} || 0;
my $smpCount = $statistics->{smpCount} || 0;
my $snCount = $statistics->{snCount} || 0;

my $noGiForGbCount = $statistics->{noGiForGbCount} || 0;
my $noSmpForGiCount = $statistics->{noSmpForGiCount} || 0;
my $noModbaseForSmpGiCount = $statistics->{noModbaseForSmpGiCount} || 0;
my $noModbaseForUpCount = $statistics->{noModbaseForUpCount} || 0;
my $noModbaseForSmpCount = $statistics->{noModbaseForSmpCount} || 0;
my $noSmpForTcCount = $statistics->{noSmpForTcCount} || 0;
my $noModbaseForSmpTcCount = $statistics->{noModbaseForSmpTcCount} || 0;


#timestamp of this run, local time, as YYYY-MM-DD hh:mm:ss
my ($sec,$min,$hour,$mday,$mon,$year)=localtime(time);

my $dateLine = sprintf ("%4d-%02d-%02d %02d:%02d:%02d", $year+1900,$mon+1,$mday,$hour,$min,$sec);

#everything before the data rows is written as '#'-prefixed lines
print $filterFh "#$dateLine\n";

print $filterFh "#Filter Stats:  Added $gbCount modbase sequences from genbank sources to filter, $upCount sequences from uniprot sources, $snCount from 'snap' IDs, and $smpCount sequences directly from smp sources\n";
print $filterFh "#$noGiForGbCount sequences could not be mapped from gbs to gis\n";
print $filterFh "#$noSmpForGiCount sequences could be mapped from gb to gi, but were not subsequently mapped to SMP ids\n";
print $filterFh "#$noModbaseForSmpGiCount sequences could be mapped from gb to gi to smp, but were not subsequently mapped to modbase ids\n";
print $filterFh "#$noModbaseForUpCount sequences could not be mapped from uniprot IDs to modbase seq ids\n";
print $filterFh "#$noModbaseForSmpCount sequences could not be mapped from primary SMP sources to modbase sequences\n";
print $filterFh "#$noSmpForTcCount sequences could not be mapped from TC annotations to SMP sources\n";
print $filterFh "#$noModbaseForSmpTcCount sequences were mapped from TC to SMP but could subsequently not be found in modbase\n";

my $columnHeaderString = "#" . join("\t", @columnHeaders) . "\n";
print $filterFh $columnHeaderString;

print $filterFh $filterString . "\n";

#buffered write errors (e.g. a full disk) only surface at close, so check it
$filterFh->close() || die "could not finish writing $filterOutputFile: $!\n";




####################################################################################
##############################   Subroutines #######################################
####################################################################################
#Reads the modbase map file: one tab-separated record per line holding
#modbase sequence id, database name and database id.
#Returns a single hash ref that is indexed in both directions:
#    $map->{dbId}                             = modbaseSeqId
#    $map->{modbaseSeqId}->{dbName}->{dbId}   = 1
#Dies if the file cannot be opened.
sub loadGlobalModbaseMap{
    my ($mapFile) = @_;

    my $fh = FileHandle->new("<" . $mapFile) || die "could not open smp modbase map file $mapFile\n";

    my $lookup;
    while (my $record = <$fh>){
        chomp $record;
        my ($modbaseSeqId, $dbName, $dbId) = split(/\t/, $record);

        #forward direction: external id to modbase sequence id
        $lookup->{$dbId} = $modbaseSeqId;
        #reverse direction: every external id known for a modbase sequence, grouped by database
        $lookup->{$modbaseSeqId}->{$dbName}->{$dbId} = 1;
    }
    return $lookup;
}


#Reads a two-column, tab-separated file of (gb id, gi) pairs and returns
#a hash ref $map->{gbId} = gi.  A gb id holds a single gi, so if the same
#gb id occurs on several lines the last one read wins.
#Progress is reported on STDERR; dies if the file cannot be opened.
sub loadGbToGiMap {
    my ($gbToGiFile) = @_;

    print STDERR "Loading gb to genbank map from $gbToGiFile\n\n";

    my $fh = FileHandle->new("<" . $gbToGiFile) || die "could not open $gbToGiFile\n";

    my $giFor;
    while (my $record = <$fh>){
        chomp $record;
        my ($gbId, $gi) = split(/\t/, $record);
        $giFor->{$gbId} = $gi;
    }

    my $gbCount = scalar(keys %$giFor);
    print STDERR "Done loading gb to genbank map.  Loaded $gbCount entries\n\n";

    return $giFor;
}

#Reads a two-column, tab-separated file of (sn id, smp id) pairs and
#returns a hash ref $map->{snId} = smpId.
#Dies if the file cannot be opened.
sub loadWilsonToSmpMap{
    my ($mapFile) = @_;

    my $smpFor;
    my $fh = FileHandle->new("<" . $mapFile) || die "could not open wilson map file $mapFile\n";

    while (my $record = <$fh>){
        chomp $record;
        my ($snId, $smpId) = split(/\t/, $record);
        $smpFor->{$snId} = $smpId;
    }
    return $smpFor;
}
#Reads a tab-separated file of (smp id, gi, status) records and returns a
#hash ref $map->{gi} = smpId.  Only records whose status column is exactly
#"valid" are kept; everything else is ignored.
#Dies if the file cannot be opened.
sub loadGiToSmpMap{
    my ($giToSmpMapFile) = @_;

    my $fh = FileHandle->new("<" . $giToSmpMapFile) || die "could not open gi to smp map $giToSmpMapFile\n";

    my $smpFor;
    while (my $record = <$fh>){
        chomp $record;
        my ($smpId, $gi, $status) = split(/\t/, $record);
        next unless ($status eq "valid");
        $smpFor->{$gi} = $smpId;
    }
    return $smpFor;
}

#Reads the tab-separated TC-to-SMP map and returns a hash ref keyed on the
#first column of each line.  The value is the whole (chomped) line, which
#mapTcAccession later splits into its individual fields.
#Dies if the file cannot be opened.
sub loadTcToSmpMap{

    my ($mapFile) = @_;
    my $fh = FileHandle->new("<" . $mapFile) || die "could not open tigr to smp map file $mapFile\n";

    #kept under its own name: the result used to be declared as a second "my $mapFile",
    #which masked the file-name parameter within the same scope
    my $lineForTc;
    while (my $line = <$fh>){
        chomp $line;
        my @cols = split(/\t/, $line);
        $lineForTc->{$cols[0]} = $line;
    }

    return $lineForTc;
}

#Connects to the "modbase" MySQL database through DBI and returns the
#database handle.  RaiseError is enabled on the connection.
#A "connecting" notice is written to STDERR first.
#NOTE(review): the user name and password are hard-coded in this file --
#consider supplying them from outside the source.
sub loadDbh{

    print STDERR "connecting\n";

    my $dbh = DBI->connect(
        "DBI:mysql:database=modbase:hostname=modbase",
        "modbase",
        "modbasesecret",
        {RaiseError => 1},
    );

    return $dbh;
}

#Reads the tab-separated Curwen data file and returns a hash ref
#$map->{sourceDb}->{accession} = proteinName, taking the protein name,
#source database and accession from the 3rd, 4th and 5th columns.
#Lines beginning with "Spot" are treated as the header and skipped.
#Progress is reported on STDERR; dies if the file cannot be opened.
sub loadCurwenAccessions{
    my ($fileName) = @_;
    my $fh = FileHandle->new("<" . $fileName) || die "couldn't open $fileName\n";

    print STDERR "Loading Curwen filter accessions from $fileName\n\n";

    my $nameFor;
    while (my $record = <$fh>){
        chomp $record;
        next if ($record =~ /^Spot/);  #header line

        my @cols = split(/\t/, $record);
        my ($name, $sourceDb, $accession) = @cols[2, 3, 4];
        $nameFor->{$sourceDb}->{$accession} = $name;
    }

    my $curwenCount = countAccessionsFromAllDbs($nameFor);   #helper method for stats
    print STDERR "Done loading Curwen accessions.  Loaded $curwenCount entries\n\n";

    return $nameFor;
}

#Reads one tab-separated Knudsen data file and returns a hash ref
#$map->{sourceDb}->{accession} = proteinName, taking the protein name,
#source database and accession from the first three columns.
#Lines beginning with "PROTEIN" are skipped, as is every record whose
#protein name does not contain the literal tag "[Sm]".
#A summary is written to STDERR; dies if the file cannot be opened.
sub loadKnudsenAccessions{
    my ($fileName) = @_;

    my $fh = FileHandle->new("<" . $fileName) || die "couldn't open $fileName\n";

    my $nameFor;
    while (my $record = <$fh>){
        chomp $record;
        next if ($record =~ /^PROTEIN/);

        my ($name, $sourceDb, $accession) = split(/\t/, $record);

        #knudsen has multiple species represented due to experimental issues.
        #This is tracked with [Sm] (or other species) in protein name
        next unless ($name =~  /\[Sm\]/);

        $nameFor->{$sourceDb}->{$accession} = $name;
    }

    my $count = countAccessionsFromAllDbs($nameFor);  #helper method for stats
    print STDERR "Done loading Knudsen accessions.  Loaded $count entries\n\n";

    return $nameFor;
}


#Reads the tab-separated Hansell data file and returns a hash ref
#$map->{sourceDb}->{id} = proteinName (name taken from the 5th column).
#Any line containing a '#' anywhere is skipped.
#When the first column starts with "Smp" the id is taken from the second
#column and filed under source db "smp"; otherwise the first column is the
#id and it is filed under "sp".
#Each record is echoed to STDERR; dies if the file cannot be opened.
#The second argument ($validSourceDbs) is accepted but not used.
sub loadHansellAccessions{
    my ($fileName, $validSourceDbs) = @_;

    my $fh = FileHandle->new("<" . $fileName) || die "Could not open $fileName\n";

    my $nameFor;
    while (my $record = <$fh>){
        chomp $record;
        next if ($record =~ /\#/);

        my @cols = split(/\t/, $record);
        my $isSmp = ($cols[0] =~ /^Smp/);

        my $sourceDb = $isSmp ? "smp"    : "sp";
        my $id       = $isSmp ? $cols[1] : $cols[0];
        my $name     = $cols[4];

        print STDERR "Hansell map: Got id $id source db $sourceDb name $name\n";
        $nameFor->{$sourceDb}->{$id} = $name;
    }

    return $nameFor;

}


#Returns the total number of accessions in a two-level hash ref of the form
#$map->{sourceDb}->{accession}, i.e. the sum of the key counts of every
#per-database hash.
sub countAccessionsFromAllDbs{
    my ($map) = @_;

    my $total = 0;
    foreach my $sourceDb (keys %$map){
        my $perDb = $map->{$sourceDb};
        $total += scalar(keys %$perDb);
    }
    return $total;

}

#Merges one loaded data set into the combined hash and returns the combined hash.
#   $accessions : $accessions->{sourceDb}->{accession} = proteinName
#   $allData    : $allData->{sourceDb}->{accession}->{dataSourceName} = proteinName
#Recording the data source name under each accession keeps track of every
#data source that contributed a given protein to the filter.
#A progress line is written to STDERR.
sub addAccessionsToFilter{
    my ($allData, $accessions, $dataSourceName) = @_;

    print STDERR "Adding accessions loaded from $dataSourceName to final filter\n\n";

    foreach my $sourceDb (keys %$accessions){
        my $incoming = $accessions->{$sourceDb};

        foreach my $accession (keys %$incoming){
            $allData->{$sourceDb}->{$accession}->{$dataSourceName} = $incoming->{$accession};
        }
    }
    return $allData;
}

#Maps a uniprot accession to its modbase sequence id through the global map.
#Returns ($modbaseSeqId, $smpId):
#   - $modbaseSeqId is false when the accession is not in the map; the miss is
#     counted in $statistics->{noModbaseForUpCount} and logged on STDERR
#   - $smpId is one of the ids stored under the "CU" database for that
#     sequence (the first key returned by keys), or "not_found"
sub mapUpAccession{

    my ($upAccession, $globalModbaseMap, $statistics) = @_;

    my $seqId = $globalModbaseMap->{$upAccession};
    unless ($seqId){
        $statistics->{noModbaseForUpCount}++;
        print STDERR "could not map to modbase seq id from up $upAccession\n";
        return ($seqId, "not_found");
    }

    #get smp id for this upId / modbaseSeqId
    my $smpId = "not_found";
    my $cuIds = $globalModbaseMap->{$seqId}->{"CU"};
    if ($cuIds){
        my @candidates = keys %$cuIds;
        $smpId = $candidates[0];
    }

    return ($seqId, $smpId);
}

#Maps a genbank accession to a modbase sequence id via GB --> GI --> SMP --> modbase.
#Returns ($modbaseSeqId, $smpId):
#   - $modbaseSeqId is undef/false as soon as any hop fails
#   - $smpId is the SMP id found for the gi, or "not_found" when the chain
#     breaks before an SMP id is known
#Each kind of failure is counted in $statistics (noGiForGbCount, noSmpForGiCount,
#noModbaseForSmpGiCount) and logged on STDERR.
#$statistics must already be a hash ref, otherwise the counts stay local to this call.
sub mapGbAccession{

    my ($gbAccession, $globalModbaseMap, $giToSmpMap, $gbToGiMap, $statistics) = @_;

    my $modbaseSeqId;
    my $smpId = "not_found";

    #hop 1: gb accession to gi
    my $gi = $gbToGiMap->{$gbAccession};
    unless ($gi){
        $statistics->{noGiForGbCount}++;
        #logged like the other two failure cases so the lost accession can be identified
        print STDERR "could not map to gi from gb $gbAccession\n";
        return ($modbaseSeqId, $smpId);
    }

    #hop 2: gi to smp.  Looked up into its own variable and copied over only on
    #success: redeclaring $smpId here would hide the outer one and make the sub
    #return "not_found" even for accessions that mapped cleanly
    my $mappedSmpId = $giToSmpMap->{$gi};
    unless ($mappedSmpId){
        $statistics->{noSmpForGiCount}++;
        print STDERR "could not map to smp from gi $gi\n";
        return ($modbaseSeqId, $smpId);
    }
    $smpId = $mappedSmpId;

    #hop 3: smp to modbase sequence id
    $modbaseSeqId = $globalModbaseMap->{$smpId};
    unless ($modbaseSeqId){
        $statistics->{noModbaseForSmpGiCount}++;
        print STDERR "could not map to modbase seq id from smp $smpId after gi map\n";
    }

    return ($modbaseSeqId, $smpId);
}

#Maps a TC accession to a modbase sequence id via TC --> SMP --> modbase.
#$tcToSmpMap holds, per TC accession, a tab-separated line whose 2nd field is
#the SMP id and whose 3rd to 6th fields become the notes string.
#Returns ($modbaseSeqId, $smpId, $notesString):
#   - (0, 0, 0) when the TC accession is not in the map
#     (counted in $statistics->{noSmpForTcCount})
#   - a false $modbaseSeqId when the SMP id has no modbase entry
#     (counted in $statistics->{noModbaseForSmpTcCount})
sub mapTcAccession{
    my ($tcAccession, $globalModbaseMap, $tcToSmpMap, $statistics) = @_;

    my $tcInfoLine = $tcToSmpMap->{$tcAccession};
    if (!$tcInfoLine){
        $statistics->{noSmpForTcCount}++;
        return (0, 0, 0);
    }

    #first field is the TC accession the line was keyed on, so it is not needed again
    my @fields = split(/\t/, $tcInfoLine);
    my ($smp, $newTc, $splitMerge, $significance, $length) = @fields[1 .. 5];

    my $modbaseSeqId = $globalModbaseMap->{$smp};
    $statistics->{noModbaseForSmpTcCount}++ unless ($modbaseSeqId);

    return ($modbaseSeqId, $smp, "$newTc $splitMerge $significance $length");
}


#simple output method
#Despite the name nothing is printed here: the sub builds and RETURNS one
#newline-terminated, tab-separated filter row with the fields
#   modbase seq id, source database name, accession id, smp id, notes,
#   comma-separated data source names, protein name
#$dataSources is a hash ref {dataSourceName => proteinName}.  The data source names
#are sorted so that both their order and the protein name that gets picked (the one
#belonging to the first name) are the same on every run instead of depending on
#hash ordering.
#$filterFh is accepted for the benefit of existing callers but is not used.
sub printFilterLine{

    my ($sourceDatabaseName, $modbaseSeqId, $accessionId, $smpId, $notesString, $dataSources, $filterFh) = @_;

    my @dataSourceList = sort keys %$dataSources;
    my $proteinName = $dataSources->{$dataSourceList[0]};  #protein name of the first data source

    my $dataSourceString = join(', ', @dataSourceList);

    my $line = "$modbaseSeqId\t$sourceDatabaseName\t$accessionId\t$smpId\t$notesString\t";
    $line .= "$dataSourceString\t$proteinName\n";
    return $line;
}






