#!/usr/bin/perl

use strict;
use FileHandle;
use Getopt::Long;


# Command line option targets; filled in by GetOptions below.
my ($massSpecFile, $uniprotFile, $outputFile, $accessionColumn, $peptideColumn, $peptideStartColumn, $goTermListFile, $help);

# Localization labels shared by the subroutines in this file.
my $INTRACELLULAR = "intracellular";
my $EXTRACELLULAR = "extracellular";

my $options = GetOptions("mass_spec_file=s" => \$massSpecFile,
                         "uniprot_file=s" => \$uniprotFile,
                         "output_file=s" => \$outputFile,
                         "accession_column=i" => \$accessionColumn,
                         "peptide_column=i" => \$peptideColumn,
                         "peptide_start_column=i" => \$peptideStartColumn,
                         "go_term_list_file=s" => \$goTermListFile,
                         "help" => \$help,
    );

# --help always wins, even when every other option was also supplied.
if ($help){
    &outputHelp();
    exit(0);
}

# Column numbers are 1-based on the command line, so 0 is treated as "missing" here as well.
unless ($massSpecFile && $uniprotFile && $outputFile && $accessionColumn && $peptideColumn && $peptideStartColumn && $goTermListFile){

    print STDERR "\nUsage: append_ptm_in_cell.pl\n\n";
    print STDERR "Reads in mass spec file and uniprot annotation, and appends uniprot annotation regarding broad cellular location of each peptide\n\n";
    print STDERR "Run with --help for full usage and file specs\n\n";
    print STDERR "Command line options:\n";
    print STDERR "\t--mass_spec_file\tname of the mass spec file\n\n";
    print STDERR "\t--uniprot_file\t\tname of uniprot annotation file\n\n";
    print STDERR "\t--output_file\t\tname of the output file\n\n";
    print STDERR "\t--accession_column\tColumn number of the uniprot accession in the mass spec file\n\n";
    print STDERR "\t--peptide_column\tColumn number of the peptide sequence in the mass spec file\n\n";
    print STDERR "\t--peptide_start_column\tColumn number representing the protein sequence position of the first residue of the peptide\n\n";
    print STDERR "\t--go_term_list_file\tfile containing list of go accessions and localization information\n\n";

    exit(1);
}

# Convert the 1-based column numbers given by the user to 0-based array indices.
$peptideColumn--;
$peptideStartColumn--;
$accessionColumn--;


my $outputFh = FileHandle->new(">" . $outputFile) || die "could not open output file $outputFile for writing: $!\n";

# Rewrite carriage returns in the mass spec file as newlines, in place.
# The list form of system() bypasses the shell, so file names containing
# spaces or shell metacharacters are passed through untouched.
my @convertCmd = ($^X, '-p', '-i', '-e', 's/\015/\n/g', $massSpecFile);
if (system(@convertCmd) != 0){
    # Best effort only: a file that already has Unix line endings can still be read.
    print STDERR "warning: could not normalize line endings in $massSpecFile (status $?)\n";
}

#$absInfo: hash keyed on file lines, contains uniprot accession and residue position of each PTM found in the peptide for that line
my $absInfo = &readMsFile($massSpecFile, $peptideColumn, $peptideStartColumn, $accessionColumn, $outputFh);


#$accFeatures: hash keyed on Uniprot feature types ("CC", "FT", "DR"), maps each accession to its localization based on calls from these types
my $accFeatures = &readUniprotAnnotation($uniprotFile);

&appendLocation($absInfo, $accFeatures, $goTermListFile, $outputFh);

#use all of this to annotate files and analyze galnac stuff
#make sure there is always a go term in the dictionary present (no annotation with only empty strings)

#Process Uniprot annotation file downloaded from uniprot site
#
# Parameters:
#   $uniprotFile - path of the UniProt flat-text annotation file
#
# Returns a hash reference keyed on the two-letter UniProt line tag:
#   {FT}->{accession}->{start}->{end}       end position of a TOPO_DOM / TRANSMEM feature
#   {FT}->{accession}->{start}->{location}  description text of that feature
#   {CC}->{accession}->{text} = 1           first sentence of the SUBCELLULAR LOCATION comment
#   {DR}->{accession}->{GO id} = 1          GO cross references in the "C:" branch
# Every accession listed on the entry's AC line(s) receives the same annotation.
# Returns undef when no annotation was collected.
#
# Dies if the file cannot be opened or if a SUBCELLULAR LOCATION comment is
# followed by a line that is not a CC line.
sub readUniprotAnnotation{
    my ($uniprotFile) = @_;

    my $fh = FileHandle->new("<" . $uniprotFile) || die "could not open uniprot file $uniprotFile: $!\n";

    my $accFeatures;

    while (<$fh>){
        chomp;
        my $line = $_;
        # First two characters are the uniprot info tag. substr is used instead of a
        # regex capture so that short or blank lines cannot pick up a stale $1.
        my $tag = substr($line, 0, 2);
        next unless ($tag eq "AC");  #found next accession

        #all accessions of the entry are tagged with the annotation for full coverage
        my @currentAccessions = &_accessionsFromLine($line);
        my $refAcc = $currentAccessions[0];

        #read all information for this accession
        while (<$fh>){
            chomp;
            $line = $_;
            my $tag = substr($line, 0, 2);
            last if ($tag eq "//");  #end of information for this accession

            #Long accession lists wrap onto additional AC lines
            if ($tag eq "AC"){
                push(@currentAccessions, &_accessionsFromLine($line));
                $refAcc = $currentAccessions[0];
                next;
            }

            #Process topological domain annotation
            if ($tag eq "FT"){
                my @cols = split('\s+', $line);
                my $featureType = $cols[1];
                if (defined($featureType) && ($featureType eq "TOPO_DOM" || $featureType eq "TRANSMEM")){  #features we're looking for
                    my $start = $cols[2];
                    my $end = $cols[3];
                    # Everything after the end position is the description. A range is
                    # required here: the slice [4, $last] would keep only two words.
                    my $location = join(' ', @cols[4 .. $#cols]);
                    foreach my $accession (@currentAccessions){  #add topological annotation to all current accessions
                        $accFeatures->{$tag}->{$accession}->{$start}->{end} = $end;
                        $accFeatures->{$tag}->{$accession}->{$start}->{location} = $location;
                    }
                }
            }

            #Comment section; may contain information on localization
            if ($tag eq "CC" && $line =~ /SUBCELLULAR LOCATION/){
                my $fullAnnotation = $line;
                my $nextLine = <$fh>;
                $nextLine = "" unless defined $nextLine;  #end of file is reported by the format check below
                chomp $nextLine;
                while (!($nextLine =~ /\-\-\-\-\-/) && !($nextLine =~ /\-\!\-/)){  #-!- means found next comment, ---- means end of all comments

                    #Comment can span multiple lines. Parse out everything after the CC tag
                    if ($nextLine =~ /^CC\s+(.*)/){
                        $fullAnnotation .= " " . $1;
                    }
                    else {
                        die "did not get expected format when reading comments for accession $refAcc (found line $nextLine)\n";
                    }
                    $nextLine = <$fh>;
                    $nextLine = "" unless defined $nextLine;
                    chomp $nextLine;
                }

                # Only record the annotation when the first sentence could actually be
                # extracted; otherwise $1 would still hold an unrelated earlier capture.
                if ($fullAnnotation =~ /SUBCELLULAR LOCATION\: (.*?)\./){
                    my $briefAnnotation = $1;
                    foreach my $accession (@currentAccessions){  #add comment annotation to all current accessions
                        $accFeatures->{$tag}->{$accession}->{$briefAnnotation} = 1;
                    }
                }
            }

            #Process GO annotation
            if ($tag eq "DR"){
                if ($line =~ /GO\; (GO\:\d+)\; C\:(.*)\;/){
                    my $annotatedGoId = $1;
                    foreach my $accession (@currentAccessions){  #add GO annotation to all current accessions
                        $accFeatures->{$tag}->{$accession}->{$annotatedGoId} = 1;
                    }
                }
            }
        } #end current accession
    }

    $fh->close();

    return $accFeatures;
}

# Extract the accessions from one "AC   P12345; Q99999;" line, without the
# leading tag, the separating whitespace or the trailing semicolons.
sub _accessionsFromLine{
    my ($line) = @_;

    my @cols = split('\s+', $line);
    shift(@cols);  #drop the AC tag itself

    my @accessions;
    foreach my $col (@cols){
        (my $accession = $col) =~ s/[\s;]//g;
        push(@accessions, $accession);
    }
    return @accessions;
}


# Read the tab-separated mass spec file.
#
# The first line is the column header; it is copied to $outputFh with the two
# result column names appended. Every following line is recorded in the
# returned hash reference, keyed on the full text of the line:
#   {line}->{uniprot}                accession found in $accessionColumn
#   {line}->{positions}->{pos} = 1   one entry per Phospho / HexNAc modification
# Column arguments are 0-based indices. A summary is printed to STDERR.
sub readMsFile{
    my ($fileName, $peptideColumn, $peptideStartColumn, $accessionColumn, $outputFh) = @_;

    my $fh = FileHandle->new("<" . $fileName) || die "could not open mass spec file $fileName: $!\n";

    my $linesRead = 0;
    my $recordFor;
    my %seenAccession;

    while (my $row = <$fh>){
        chomp $row;
        $linesRead++;

        if ($linesRead == 1){
            #File column header. Output line exactly, add two columns for results of this script
            (my $header = $row) =~ s/\s+$//;
            print $outputFh $header . "\tAnnotated Localization\tAnnotation Type\n";
            next;
        }

        my @fields = split('\t', $row);
        my ($accession, $peptide, $peptideStart) = @fields[$accessionColumn, $peptideColumn, $peptideStartColumn];

        #if any modification other than Acetyl- is in the peptide, and is not in ()'s, it needs to be added here
        $peptide =~ s/Acetyl\-//g;

        #parse the ()'s of the peptide; only the PTMs we are interested in (glcnac or phospho) come back
        my $modifications = &getModifications($peptide, $accession, $peptideStart);

        $recordFor->{$row}->{uniprot} = $accession;
        $seenAccession{$accession} = 1;

        #save the absolute position of every modification of this line
        foreach my $residueIndex (keys %$modifications){
            # -1 to convert to base 0, -1 because start position shouldn't be counted as the first position
            my $absolutePosition = $peptideStart + $residueIndex - 2;
            $recordFor->{$row}->{positions}->{$absolutePosition} = 1;
        }
    }

    my $accessionCount = scalar(keys %seenAccession);
    print STDERR "read $linesRead lines containing $accessionCount unique accessions from $fileName\n";

    return $recordFor;
}


# Find the post-translational modifications of interest in a peptide string.
#
# Modifications are written in parentheses directly after the residue they
# belong to, e.g. "AS(Phospho)DT(HexNAc)K". Only "Phospho" and "HexNAc" are
# kept (if the format of these tags changes, the comparison below must change
# too); any other parenthesised modification is skipped.
#
# Parameters:
#   $peptide       - peptide sequence including the modification tags
#   $uniprot       - accession of the protein; only used in the error message
#   $startPosition - not used by this routine, kept for the existing callers
#
# Returns a hash reference mapping the base-1 index of the modified residue
# within the peptide to the modification name, or undef when nothing was found.
#
# Dies when a '(' is never closed. The previous character-by-character scan
# ran past the end of the peptide and never terminated in that case.
sub getModifications{
    my ($peptide, $uniprot, $startPosition) = @_;

    my $modifications;
    my $residueCounter = 0;
    my $peptideLength = defined($peptide) ? length($peptide) : 0;

    my $i = 0;
    while ($i < $peptideLength){
        if (substr($peptide, $i, 1) ne '('){
            #ordinary character: counts as one residue
            $residueCounter++;
            $i++;
            next;
        }

        #found a PTM, see what's inside the ()'s
        my $close = index($peptide, ')', $i + 1);
        if ($close < 0){
            my $accession = defined($uniprot) ? $uniprot : "unknown";
            die "unterminated '(' in peptide $peptide (accession $accession): could not parse modifications\n";
        }

        my $modification = substr($peptide, $i + 1, $close - $i - 1);
        if ($modification eq "Phospho" || $modification eq "HexNAc"){
            #keyed on the index of the PTM'd residue as it appears in $peptide (base-1)
            $modifications->{$residueCounter} = $modification;
        }

        $i = $close + 1;  #continue after the closing parenthesis
    }

    return $modifications;
}

# Append the localization call to every mass spec line and write it out.
#
# Parameters:
#   $absInfo     - hash ref keyed on mass spec lines ({uniprot}, {positions}), as built by readMsFile
#   $accFeatures - hash ref of uniprot annotation, as built by readUniprotAnnotation
#   $goFileName  - path of the GO term list file
#   $outputFh    - file handle the annotated lines are written to
#
# For each line the annotation types are tried in the order 'FT' (topology),
# 'DR' (GO terms), 'CC' (comments); the first one that yields an intra- or
# extracellular call is written and the remaining types are skipped. Lines
# without such a call are written with "unknown annotation" or "no annotation".
# Lines are written in order of accession, then of line text, so the output is
# reproducible between runs.
#
# Statistics are printed to STDERR, and every annotation that could not be
# classified is listed in the file "unknownAnnotationList" in the current directory.
sub appendLocation{
    my ($absInfo, $accFeatures, $goFileName, $outputFh) = @_;

    my $topologyAnnotation = &getTopologyAnnotation();
    my $commentRegions = &getCommentRegions();

    #$goTerms: hash keyed on GO IDs; value is the localization value
    #(read from the file name passed in, not from the command line variable)
    my $goTerms = &readGoTermListFile($goFileName);

    my $unknownTopology;
    my $unknownGo;
    my $unknownComments;

    #stats; start at 0 so that counters which are never incremented still print as a number
    my ($topologyCount, $extracellularTopologyCount, $unknownTopologyCount) = (0, 0, 0);
    my ($goCount, $extracellularGoCount, $unknownGoCount) = (0, 0, 0);
    my ($commentCount, $extracellularCommentCount, $unknownCommentCount) = (0, 0, 0);
    my ($noAnnotationCount, $unknownAnnotationCount) = (0, 0);

    #for each line, get its accession, check all uniprot annotation, see what localization call is, output
    foreach my $line (sort {$absInfo->{$a}->{uniprot} cmp $absInfo->{$b}->{uniprot} or $a cmp $b} keys %$absInfo){

        my $accession = $absInfo->{$line}->{uniprot};
        my $positions = $absInfo->{$line}->{positions};

        #FT / topology
        my $topology = $accFeatures->{"FT"}->{$accession};
        my $topologyLocationData;   #$locationData->{extracellular} = tag type / tag

        foreach my $position (keys %$positions){
            foreach my $topologyStart (keys %$topology){
                my $topologyEnd = $topology->{$topologyStart}->{end};
                next unless ($topologyStart <= $position && $position <= $topologyEnd);

                #get annotated location of this region; its first word decides the call
                my $location = $topology->{$topologyStart}->{location};
                my ($locationFirstWord) = ($location =~ /(^\S+)/);
                my $topologicalRegion = defined($locationFirstWord) ? $topologyAnnotation->{$locationFirstWord} : undef;

                unless ($topologicalRegion){
                    $topologicalRegion = "unknown";
                    $unknownTopology->{$location}++;
                }

                $topologyLocationData->{$topologicalRegion}->{$location} = 1;
            }
        }

        my ($foundTopology, $foundExtracellular, $onlyUnknownTopology) = &processLocationData($line, $topologyLocationData, "topology", $outputFh);
        $topologyCount += $foundTopology;
        $extracellularTopologyCount += $foundExtracellular;
        $unknownTopologyCount += $onlyUnknownTopology;
        next if $foundTopology;

        #DR / GO
        my $goLocationData;
        my $go = $accFeatures->{"DR"}->{$accession};

        foreach my $goId (keys %$go){
            next unless ($goId =~ /\S/);
            my $goRegion = $goTerms->{$goId};
            unless ($goRegion){
                $goRegion = "unknown";
                $unknownGo->{$goId}++;
            }
            $goLocationData->{$goRegion}->{$goId} = 1;
        }

        my ($foundGo, $foundExtracellularGo, $onlyUnknownGo) = &processLocationData($line, $goLocationData, "GO", $outputFh);
        $goCount += $foundGo;
        $extracellularGoCount += $foundExtracellularGo;
        $unknownGoCount += $onlyUnknownGo;
        next if $foundGo;

        # CC / Comments
        my $commentLocationData;
        my $comments = $accFeatures->{"CC"}->{$accession};
        foreach my $comment (keys %$comments){
            my $commentRegion = $commentRegions->{$comment};
            unless ($commentRegion){
                $unknownComments->{$comment}++;
                $commentRegion = "unknown";
            }
            $commentLocationData->{$commentRegion}->{$comment} = 1;
        }

        my ($foundComment, $foundExtracellularComment, $onlyUnknownComment) = &processLocationData($line, $commentLocationData, "comments", $outputFh);
        $commentCount += $foundComment;
        $extracellularCommentCount += $foundExtracellularComment;
        $unknownCommentCount += $onlyUnknownComment;
        next if ($foundComment);

        #nothing gave an intra- or extracellular call for this line
        my $defaultTag;
        if ($onlyUnknownTopology || $onlyUnknownGo || $onlyUnknownComment){
            $defaultTag = "unknown annotation";
            $unknownAnnotationCount++;
        }
        else {
            $defaultTag = "no annotation";
            $noAnnotationCount++;
        }

        print $outputFh $line . "\t$defaultTag\n";
    }

    my $totalAnnotation = $topologyCount + $goCount + $commentCount;

    print STDERR "Lines annotated with topology information: $topologyCount\n";
    print STDERR "Of these, number that were extracellular: $extracellularTopologyCount\n";
    print STDERR "Lines that were annotated with topology information that wasn't in any localization: $unknownTopologyCount\n\n";

    print STDERR "Lines annotated with go information: $goCount\n";
    print STDERR "Of these, number that were extracellular: $extracellularGoCount\n";
    print STDERR "Lines that were annotated with GO terms that weren't in any localization: $unknownGoCount\n\n";

    print STDERR "Lines annotated with comments: $commentCount\n";
    print STDERR "Of these, number that were extracellular: $extracellularCommentCount\n";
    print STDERR "Lines that were annotated with comments that weren't in any localization: $unknownCommentCount\n\n";

    print STDERR "Lines that had at least one form of annotation: $totalAnnotation\n";
    print STDERR "Lines that had only 'unknown' annotation (not tagged as intra- or extracellular): $unknownAnnotationCount\n";
    print STDERR "Lines that had no annotation of any kind: $noAnnotationCount\n";

    my $unknownStatsFh = FileHandle->new(">unknownAnnotationList") || die "could not open unknownAnnotationList for writing: $!\n";
    &outputUnknown("Topology", $unknownTopology, $unknownStatsFh);
    &outputUnknown("GO", $unknownGo, $unknownStatsFh);
    &outputUnknown("Comments", $unknownComments, $unknownStatsFh);
    #buffered write errors only surface when the handle is closed
    $unknownStatsFh->close() || die "could not finish writing unknownAnnotationList: $!\n";

}


# Write one section of the unknown-annotation report to $fh: a heading naming
# the annotation type, then one "annotation<TAB>count" line per entry of the
# $unknown hash reference, highest count first, then an empty line.
sub outputUnknown{
    my ($type, $unknown, $fh) = @_;

    my $heading = "Annotations of type $type that were not tagged as 'intracellular' or 'extracellular' (number in ())\n";
    print $fh $heading;

    for my $annotation (sort { $unknown->{$b} <=> $unknown->{$a} } keys %$unknown){
        print $fh join("\t", $annotation, $unknown->{$annotation}) . "\n";
    }

    print $fh "\n";
}

# Decide the localization call for one mass spec line from one annotation type
# and, when a call can be made, write the annotated line.
#
# Parameters:
#   $line           - the original mass spec line
#   $locationData   - hash ref: region ("extracellular", "intracellular", "unknown")
#                     -> hash ref whose keys are the annotations supporting that region
#   $annotationType - label written in front of the annotations ("topology", "GO", "comments")
#   $fh             - optional file handle to write to; the script's output file
#                     handle is used when it is omitted
#
# Extracellular evidence takes precedence over intracellular evidence. The
# supporting annotations are written in sorted order so that the output is the
# same on every run.
#
# Returns the list ($found, $foundExtracellular, $onlyUnknown), each 0 or 1:
#   $found              - an intra- or extracellular call was made and the line was written
#   $foundExtracellular - that call was extracellular
#   $onlyUnknown        - no call was made, but unclassified annotation was present
sub processLocationData{
    my ($line, $locationData, $annotationType, $fh) = @_;
    $fh = $outputFh unless defined($fh);

    my $foundTopology = 0;
    my $foundExtracellular = 0;
    my $onlyUnknown = 0;

    my $extracellular = $locationData->{$EXTRACELLULAR};
    my $intracellular = $locationData->{$INTRACELLULAR};
    my $unknown = $locationData->{"unknown"};

    my $annotation;
    my $localization;

    if ($extracellular){
        $annotation = $annotationType . " " . join(',', sort keys %$extracellular);
        $foundTopology = 1;
        $foundExtracellular = 1;
        $localization = $EXTRACELLULAR;
    }
    elsif ($intracellular){
        $annotation = $annotationType . " " . join(',', sort keys %$intracellular);
        $foundTopology = 1;
        $localization = $INTRACELLULAR;
    }
    elsif ($unknown){
        $onlyUnknown = 1;
    }

    if ($foundTopology){
        print $fh $line . "\t" . $localization . "\t" . $annotation . "\n";
    }

    return ($foundTopology, $foundExtracellular, $onlyUnknown);
}
	


# Build the lookup from the first word of a Uniprot topology description to
# the localization label it stands for. Returns a hash reference.
sub getTopologyAnnotation{

    #"Cytoplasmic." (with the period) is listed as a separate key on purpose
    my @intracellularWords = (
        "Cytoplasmic",
        "Mitochondrial",
        "Intragranular",
        "Lumenal",
        "Cytoplasmic.",
        "Nuclear",
    );

    my %regionFor = map { $_ => $INTRACELLULAR } @intracellularWords;
    $regionFor{"Extracellular"} = $EXTRACELLULAR;

    return \%regionFor;

}


# Build the lookup from the text that follows the Uniprot SUBCELLULAR LOCATION
# tag to the localization label it stands for. Only the exact comment texts
# listed here are recognized. Returns a hash reference.
sub getCommentRegions{

    #comments that indicate the protein is intracellular
    my @intracellularComments = (
        "Cytoplasm, cytosol",
        "Cytoplasm, cytosol (By similarity)",
        "Nucleus, nuclear pore complex (By similarity)",
        "Cytoplasm (Probable)",
        "Mitochondrion",
        "Cytoplasm, perinuclear region (By similarity)",
        "Nucleus (Potential)",
        "Cytoplasm, cytoskeleton (By similarity)",
        "Nucleus (Probable)",
        "Cytoplasm, cytoskeleton",
        "Nucleus (By similarity)",
        "Nucleus",
        "Cytoplasm",
        "Cytoplasm (By similarity)",
    );

    #comments that indicate the protein is extracellular
    my @extracellularComments = (
        "Secreted, extracellular space, extracellular matrix (By similarity)",
        "Secreted (By similarity)",
        "Secreted, extracellular space, extracellular matrix",
        "Secreted (Potential)",
        "Secreted",
    );

    my %regionFor;
    $regionFor{$_} = $INTRACELLULAR foreach @intracellularComments;
    $regionFor{$_} = $EXTRACELLULAR foreach @extracellularComments;

    return \%regionFor;
}






# Read the GO term list file.
#
# A line starting with "**" opens a section; the non-blank text directly after
# the asterisks is the localization label (e.g. "**intracellular"). Every other
# non-blank line is tab separated with the GO accession in its first column and
# is assigned the label of the section it appears in.
#
# Section header lines and blank lines are not GO terms: they are neither
# stored in the result nor included in the count reported on STDERR.
#
# Parameters:
#   $filename - path of the GO term list file
# Returns a hash reference mapping GO accession -> localization label, or undef
# when the file holds no GO terms. Dies if the file cannot be opened.
sub readGoTermListFile{

    my ($filename) = @_;
    my $fh = FileHandle->new("<" . $filename) || die "could not open go term list file $filename: $!\n";

    my $goTerms;

    my $goCount = 0;

    my $currentRegion = "";
    while (<$fh>){
        chomp;
        my $line = $_;
        $line =~ s/\r$//;  #tolerate files saved with Windows line endings

        if ($line =~ /^\*\*(\S+)/){
            $currentRegion = $1;
            next;  #the header names a section, it is not a GO term itself
        }
        next unless ($line =~ /\S/);

        my ($goTerm, $goTermName, $cc) = split('\t', $line);
        $goTerms->{$goTerm} = $currentRegion;
        $goCount++;
    }
    $fh->close();

    print STDERR "Read $goCount GO terms from $filename\n";

    return $goTerms;
}


# Print the full usage description (overview, program flow and the format of
# the three input files) to STDERR, followed by one extra newline.
sub outputHelp{

    # Interpolating here-document: "\\n" in the text below is printed as a
    # backslash followed by the letter n.
    my $usageText = <<"HELP";

OVERVIEW
This script tags whether proteins read in from a mass-spec data file are extracellular or intracellular, 
according to Uniprot annotation. The script adds two columns to the end of the file, one for the tag itself 
and one for the type of annotation that led to the tag.

PROGRAM FLOW
There are three types of Uniprot annotation that refers to whether a protein is intracellular or extracellular. 
They are Topology, Gene Ontology, and Comments.

For each line in the mass-spec file, the script considers each of these in order. Not all annotation types 
are present for all lines. If one type of annotation is present in Uniprot, then the script appends the 
localization of the accession in that line, and ignores any annotation that might follow. 

Topology annotation is generally available for transmembrane proteins and refers to the locations on the 
protein sequence bounding extra- and intracellular, and transmembrane regions of the protein. If a mass-spec 
peptide (specifically, a post-translational modfication) is present in between the annotated regions, then 
it is tagged with that annotation. This may result in an accession having different tags across different 
peptides (one per each line in the file).

GO Annotation is from the "Cellular Component" branch of the GO Hierarchy. The provided GO Term List file 
includes the 'intracellular' and 'extracellular region' terms along with all descendents of these terms in the 
hierarchy (a descendent indicating that this term is in the same space as its ancestor, except more specific). 

Comments are rigorously structured features in Uniprot that contains different forms of annotation. One 
of these is "Subcellular Location." 

Whether a topology or comment annotation is intra- or extracellular is hardcoded in the script 
itself ('getCommentRegions()', 'getTopologyAnnotation()'. 

For all types of annotation, often it is the case that the annotation doesn't refer to an intra- or 
extracellular region (for example, for membrane proteins). These are listed as 'unknown annotation' in the 
output file. Further, some accessions have no annotation, in which case this is also listed in the output file. 
The script outputs a file indicating tags that were counted as 'unknown' and how many times each tag was present 
for the mass spec file.

The script outputs lines in order of alphabetically increasing Uniprot accessions.

INPUT FILES:

1. Uniprot annotation file (--uniprot_file). This is specific to the set of accessions in the mass-spec file. 
   To obtain this annotation file:
    a. Create a separate text file containing only the uniprot accession list, one accession per line. It is OK if 
       an accession is present twice.
    b. Go to the Uniprot website (www.uniprot.org) and click the "Retrieve" tab at the top.
    c. Download the annotation file in "Flat Text" format. This file is the one to use with this script.

2. Gene Ontology annotation file (--go_term_list_file). This file lists whether certain GO terms are intracellular or 
   extracellular by examining the GO hierarchy. It shouldn't need to be modified as the GO hierarchy doesn't often 
   change dramatically. To recreate the file:
   a. Go to the frontend Berkeley GO SQL interface (http://www.berkeleybop.org/goose)
   b. Enter the following query:
      SELECT DISTINCT descendant.acc, descendant.name, descendant.term_type
      FROM
      term
      INNER JOIN graph_path ON (term.id=graph_path.term1_id)
      INNER JOIN term AS descendant ON (descendant.id=graph_path.term2_id)
      WHERE term.name='intracellular' AND distance <> 0 ;
   c. Click the 'download results directly to tab delimited format' checkbox.
   d. In the resulting file, append the line 
      'GO:0005622	intracellular	cellular_component' (tab separated) 
   e. Additionally, in the resulting file, the line "**intracellular" should be added as the first line in the file.
   f. Repeat the query in (b) except replace 'intracellular' with 'extracellular'
   g. In the resulting file, append the line 
      'GO:0005576	extracellular region	cellular_component' (tab separated)
   h. Additionally, in the resulting file, the line "**extracellular" should be added as the first line in the file.
   i. Finally, add all lines in the extracellular file to the bottom of the intracellular file 
      (for example, with 'cat <intraFile> <extraFile> > fullGoTermFile')

3. Mass spec file (--mass_spec_file). This file is assumed to contain, in each line, a uniprot accession, a peptide, 
   and the starting position of the peptide in the protein sequence. (The columns in the file representing this data 
   are set by command line flags.) Columns in this file should be tab-separated (a Microsoft Excel file should be 
   saved in this format). Additionally, the script modifies the original input file in-place by running s/015/\\n/g, 
   which replaces the Microsoft ^M newline with one that can be read by all other text parsers.

   Each peptide is assumed to have one or more post-translational modifications labeled either as '(Phospho)' or 
   '(HexNAc)' (including the parentheses). Other post-translational modifications in parenthesis, along with the 
   'Acetyl-' modification which is often not in parenthesis, are ignored. If other modifications are present 
   besides 'Acetyl-' in the peptides that aren't in parenthesis, the script will likely crash and at the least will 
   not produce expected behavior. The first line in the mass spec file is assumed to be the column header line.


												       

HELP

    print STDERR $usageText . "\n";

}
