#!/usr/bin/perl

use strict;
use FileHandle;

use Getopt::Long;


# Command line driver.
#
# Reads a tab-delimited mass spec file, assigns every peptide a per-accession
# cluster number based on its PTM positions, and writes the input rows back out
# with two extra columns (cluster number, global PTM positions).
#
# NOTE: the mass spec file is rewritten in place to normalise line endings
# before it is read.

my ($massSpecFile, $outputFile, $accessionColumn, $peptideStartColumn,  $modsColumn);

my $options = GetOptions("mass_spec_file=s" => \$massSpecFile,
                         "output_file=s" => \$outputFile,
                         "accession_column=i" => \$accessionColumn,
                         "peptide_start_column=i" => \$peptideStartColumn,
                         "mods_column=i" => \$modsColumn,
    );

# All five options are required. Column numbers are 1-based on the command
# line, so a value of 0 is rejected by the same truth test.
unless ($massSpecFile  && $outputFile && $accessionColumn && $peptideStartColumn && $modsColumn){

    print STDERR "\nUsage: cluster_peptides.pl\n\n";
    print STDERR "\nFor each accession, clusters peptides together if they have PTMs at the same position in the peptides.\n";
    print STDERR "\nIf there are multiple PTMs between two peptides, they will only go in the same cluster if all match\n";
    print STDERR "(and each peptide has the same number).\n";
    print STDERR "If a peptide has a 'low confidence' PTM, one of the possible positions must match one postion another peptide.\n";
    print STDERR "Currently only works for 'HexNAc' modifications\n\n";
    print STDERR "Results are output as 'Peptide Number', indicating which cluster the peptide falls in (appended).\n";
    print STDERR "Additionally, outputs the global position in the protein sequence of each PTM (appended).\n\n";

    print STDERR "Command line options:\n";
    print STDERR "\t--mass_spec_file\tname of the mass spec file (tab-delimited text file)\n\n";
    print STDERR "\t--output_file\t\tname of the output file\n\n";
    print STDERR "\t--accession_column\tColumn number of the uniprot accession in the mass spec file\n\n";
    print STDERR "\t--mods_column\t\tColumn number representing the 'Mods' list in the mass spec file\n\n";
    print STDERR "\t--peptide_start_column\tColumn number representing the protein sequence position of the first residue of the peptide\n\n";

    exit(1);
}

# Convert the 1-based column numbers into 0-based array indexes.
$peptideStartColumn--;
$accessionColumn--;
$modsColumn--;

my $outputFh = FileHandle->new($outputFile, "w") || die "could not open output file $outputFile for writing: $!\n";

# Modify the text file in place to get rid of Microsoft Excel newline
# characters. A CR LF pair is collapsed to a single newline (instead of two) and
# a bare CR becomes a newline. The list form of system() bypasses the shell, so
# file names containing spaces or shell metacharacters are handled safely; '--'
# stops a file name starting with '-' from being read as a perl switch.
my @normalizeCmd = ($^X, '-p', '-i', '-e', 's/\015\012?/\n/g', '--', $massSpecFile);
if (system(@normalizeCmd) != 0){
    # best effort: the file may already be clean, so keep going
    print STDERR "WARNING: could not normalize line endings in $massSpecFile (status $?)\n";
}

my $absInfo = readMsFile($massSpecFile, $peptideStartColumn, $accessionColumn, $modsColumn, $outputFh);

$absInfo = findClusteredPeptides($absInfo);

outputNewLines($absInfo, $outputFh);

# Buffered write errors only surface when the handle is closed.
$outputFh->close() || die "could not close output file $outputFile: $!\n";



#Assign a cluster number to every peptide line in $absInfo.
#
#Lines are grouped by uniprot accession. Within one accession each line is compared
#against the lines that already have a cluster; if sameCluster() accepts one of them
#the line joins that cluster, otherwise it starts a new one. Lines flagged 'multipleLc'
#are never compared: each of them gets a cluster of its own after all other lines.
#Cluster numbers start at 1 for every accession.
#
#Lines are visited by ascending 'firstMod', with the line text as tie-breaker, so that
#the numbering is reproducible from run to run (hash order alone is not).
#
#Param:   $absInfo - hash ref keyed on line text; reads {uniprot}, {mods}, {firstMod}, {multipleLc}
#Returns: the same hash ref, with {cluster} set on every line
#Prints a one-line summary to STDERR.
sub findClusteredPeptides{
    my ($absInfo) = @_;

    my $newClusters = 0;

    #group the lines (peptides) by the protein they belong to
    my %linesByUniprot;
    foreach my $line (keys %$absInfo){
        my $info = $absInfo->{$line};
        $linesByUniprot{$info->{uniprot}}->{$line} = {
            mods     => $info->{mods},
            firstMod => $info->{firstMod},
            multiple => $info->{multipleLc},
        };
    }

    foreach my $uniprot (keys %linesByUniprot){
        my $lineInfo = $linesByUniprot{$uniprot};
        my $nextCluster = 1;
        my %clusterOf;      #line => cluster number, for lines handled so far
        my @multipleLc;     #lines with multiple low-confidence annotations

        my @orderedLines = sort {
            $lineInfo->{$a}->{firstMod} <=> $lineInfo->{$b}->{firstMod}
                || $a cmp $b
        } keys %$lineInfo;

        foreach my $line (@orderedLines){
            if ($lineInfo->{$line}->{multiple}){
                push(@multipleLc, $line);   #handled after everything else
                next;
            }

            #check already clustered lines, lowest cluster number first, and stop at the first match
            my @clusteredLines = sort {
                $clusterOf{$a} <=> $clusterOf{$b} || $a cmp $b
            } keys %clusterOf;

            my $assigned;
            foreach my $clusteredLine (@clusteredLines){
                if (sameCluster($line, $clusteredLine, $lineInfo)){
                    $assigned = $clusterOf{$clusteredLine};
                    last;
                }
            }

            unless (defined $assigned){     #no match: start a new cluster
                $assigned = $nextCluster++;
                $newClusters++;
            }
            $clusterOf{$line} = $assigned;
        }

        #every peptide with multiple low confidence annotations is its own cluster
        foreach my $line (@multipleLc){
            $clusterOf{$line} = $nextCluster++;
            $newClusters++;
        }

        #add the actual cluster number to $absInfo
        foreach my $line (keys %clusterOf){
            $absInfo->{$line}->{cluster} = $clusterOf{$line};
        }
    }

    my $totalLines = scalar(keys %$absInfo);
    print STDERR "Created $newClusters clusters out of $totalLines total peptides\n";

    return $absInfo;
}

#Decide whether $testLine (which has not yet been clustered) belongs in the same cluster
#as $referenceLine (which is in a cluster already).
#
#Each key of a line's {mods} hash is one PTM: either a single position ("14") or a
#'|'-separated list of candidate positions for a low-confidence PTM ("14|15"). Two PTMs
#are compatible when one of the test PTM's positions is among the reference PTM's positions.
#
#The lines match when both have the same number of PTMs and the PTMs can be paired off
#one-to-one so that every pair is compatible. The pairing is found with an augmenting-path
#search, so the answer does not depend on the order in which the PTMs are visited.
#
#Params:  $testLine, $referenceLine - keys into $lineInfo
#         $lineInfo - hash ref: line => { mods => { ptm => 1, ... } }
#Returns: 1 if the lines belong together, 0 otherwise
sub sameCluster{
    my ($testLine, $referenceLine, $lineInfo) = @_;

    my @referenceMods = sort keys %{ $lineInfo->{$referenceLine}->{mods} || {} };
    my @testMods      = sort keys %{ $lineInfo->{$testLine}->{mods} || {} };

    return 0 if scalar(@referenceMods) != scalar(@testMods);

    #candidate positions of each reference PTM, as a lookup hash
    my @referencePositions;
    foreach my $referenceMod (@referenceMods){
        my %positions = map { $_ => 1 } split(/\|/, $referenceMod);
        push(@referencePositions, \%positions);
    }

    #for each test PTM, the indexes of the reference PTMs it is compatible with
    my @candidates;
    foreach my $testMod (@testMods){
        my @positions = split(/\|/, $testMod);
        my @compatible;
        foreach my $referenceIndex (0 .. $#referenceMods){
            my $allowed = $referencePositions[$referenceIndex];
            if (grep { $allowed->{$_} } @positions){
                push(@compatible, $referenceIndex);
            }
        }
        return 0 unless @compatible;    #this PTM matches nothing at all
        push(@candidates, \@compatible);
    }

    #must have 1-1 match: give every test PTM a reference PTM of its own
    my @ownerOf;    #reference index => index of the test PTM currently paired with it
    foreach my $testIndex (0 .. $#testMods){
        my %visited;
        return 0 unless _claimReferenceMod($testIndex, \@candidates, \@ownerOf, \%visited);
    }
    return 1;
}

#Helper for sameCluster: try to pair test PTM $testIndex with a reference PTM, moving
#previously paired test PTMs to another compatible reference PTM when necessary.
#Returns 1 (and updates $ownerOf) on success, 0 if no pairing is possible.
sub _claimReferenceMod{
    my ($testIndex, $candidates, $ownerOf, $visited) = @_;

    foreach my $referenceIndex (@{ $candidates->[$testIndex] }){
        next if $visited->{$referenceIndex}++;      #already tried during this search
        my $owner = $ownerOf->[$referenceIndex];
        if (!defined($owner) || _claimReferenceMod($owner, $candidates, $ownerOf, $visited)){
            $ownerOf->[$referenceIndex] = $testIndex;
            return 1;
        }
    }
    return 0;
}
	
#Split a '|'-separated string into its parts and return a hash ref that maps
#every part back to the complete original string.
#Returns undef when the string yields no parts (e.g. an empty string).
sub getSplitHash{
    my ($string) = @_;
    my $lookup;
    $lookup->{$_} = $string for split(/\|/, $string);
    return $lookup;
}


#Read the mass spec file and return $absInfo, a hash ref keyed on each data line of the
#file, with values being information about that line:
#   uniprot       - accession column value
#   startPosition - peptide start column value
#   allMods       - mods column with every position after '@', '&' or '|' converted from
#                   a peptide position to a protein position (startPosition - 1 + position)
#   mods          - hash of the Phospho/HexNAc PTMs used for clustering; keys are a single
#                   position ("14") or '|'-separated candidate positions ("14|15")
#   firstMod      - position of the first Phospho/HexNAc PTM if it is exact, else 1000000
#   multipleLc    - set to 1 when the line must not be compared with other lines
#
#The first line of the file is the column header; it is printed to $outputFh with the two
#result column names appended. Blank lines are skipped. A line that occurs more than once
#is stored once and reported on STDERR.
#
#Params:  $fileName - tab-delimited input file
#         $peptideStartColumn, $accessionColumn, $modsColumn - 0-based column indexes
#         $outputFh - handle that receives the header line
#Returns: $absInfo (undef if the file has no data lines)
#Dies if the file cannot be opened or a Phospho/HexNAc residue list has an unexpected format.
sub readMsFile{
    my ($fileName, $peptideStartColumn, $accessionColumn, $modsColumn, $outputFh) = @_;
    my $fh = FileHandle->new($fileName, "r") || die "could not open mass spec file $fileName: $!\n";

    my $noExactFirstMod = 1000000;  #sorts lines without an exact first PTM after all others
    my $sawHeader = 0;
    my $absInfo;

    while (my $line = <$fh>){
        chomp($line);

        unless ($sawHeader){  #File column header. Output line exactly, add two columns for results of this script
            $sawHeader = 1;
            $line =~ s/\s+$//;
            print $outputFh $line ."\tPeptide Number\tGlobal PTM Positions\n";
            next;
        }

        next unless $line =~ /\S/;  #blank line, not a peptide

        if ($absInfo && exists $absInfo->{$line}){
            #lines are the hash keys, so a repeat cannot be stored separately
            print STDERR "WARNING: line was repeated in file (keeping a single copy):\n$line\n";
            next;
        }

        my @cols = split(/\t/, $line);
        my $uniprot = $cols[$accessionColumn];
        my $startPosition = $cols[$peptideStartColumn];
        my $mods = $cols[$modsColumn];
        $mods = '' unless defined $mods;

        #convert peptide positions to protein positions in place, keeping the delimiter
        my $offset = ($startPosition || 0) - 1;
        $mods =~ s/([\@&|])(\d+)/$1 . ($offset + $2)/eg;

        $absInfo->{$line}->{uniprot} = $uniprot;
        $absInfo->{$line}->{startPosition} = $startPosition;
        $absInfo->{$line}->{firstMod} = $noExactFirstMod;
        $absInfo->{$line}->{allMods} = $mods;

        my $modCounter = 1;     #counts Phospho/HexNAc entries only
        foreach my $modtype (split(/;/, $mods)){
            next unless $modtype =~ /(Phospho|HexNAc)\@(.*)/;
            my $residues = $2;

            if ($residues =~ /(\d+)\=/){  #found exact (HexNAc@5=7)
                my $exact = $1;
                $absInfo->{$line}->{mods}->{$exact} = 1;
                if ($modCounter == 1){
                    $absInfo->{$line}->{firstMod} = $exact;
                }
            }
            elsif ($residues =~ /\&/){
                #found peptide with multiple low confidence annotations, don't consider these
                $absInfo->{$line}->{multipleLc} = 1;
            }
            elsif ($residues =~ /\|/){
                #no multiple low confidence, so this one must have one low confidence (HexNAc@5|6)
                $absInfo->{$line}->{mods}->{$residues} = 1;
            }
            elsif ($residues =~ /^\d+$/){
                #this happens occasionally with HexNAC@7;Oxidation@5 type format -- < 1% of the time
                $absInfo->{$line}->{multipleLc} = 1;
            }
            else {
                die "did not get expected residue format: $residues\n";
            }
            $modCounter++;
        }
    }
    $fh->close();

    return $absInfo;
}



#Output the same file with peptide cluster number and global PTM appended.
#
#Rows are written ordered by uniprot accession, then cluster number, then line text, so
#the members of a cluster are adjacent and the output is identical from run to run.
#Trailing whitespace is removed from each line before the two columns are appended.
#
#Params:  $absInfo  - hash ref keyed on line text; reads {uniprot}, {cluster}, {allMods}
#         $outputFh - handle to write to
sub outputNewLines{
    my ($absInfo, $outputFh) = @_;

    my @orderedLines = sort {
        my ($infoA, $infoB) = ($absInfo->{$a}, $absInfo->{$b});
        (defined $infoA->{uniprot} ? $infoA->{uniprot} : '')
            cmp (defined $infoB->{uniprot} ? $infoB->{uniprot} : '')
        || ($infoA->{cluster} || 0) <=> ($infoB->{cluster} || 0)
        || $a cmp $b
    } keys %$absInfo;

    foreach my $line (@orderedLines){
        my $cluster = $absInfo->{$line}->{cluster};
        my $mods = $absInfo->{$line}->{allMods};
        $cluster = '' unless defined $cluster;
        $mods = '' unless defined $mods;

        (my $trimmed = $line) =~ s/\s+$//;
        print $outputFh $trimmed . "\t" . $cluster . "\t" . $mods . "\n";
    }
}








