#!/usr/bin/perl

use strict;
use FileHandle;


# Makes a map of GenBank identifier (gb) to gi number by reading FASTA file headers.
# (Not sure of the difference, but gi is a number and gb has the format "ABQ15152".)

# The FASTA file can be obtained by downloading all schisto proteins from GenBank Protein (save as a file in FASTA format).
# The current FASTA file is ../data/schistoProteome.fasta

# Command-line arguments: the FASTA file to read and the map file to write.
# Any arguments beyond the first two are ignored.
my ($inputFile, $outputFile) = @ARGV;

# Test for presence rather than truth, so that a file literally named "0"
# is accepted. A missing or empty argument still prints the usage message
# and exits with status 1.
my $haveBothArgs = defined($inputFile)  && length($inputFile)
                && defined($outputFile) && length($outputFile);

unless ($haveBothArgs) {
    print STDERR "usage: perl makeGbToGiMap.pl <inputFile> <outputFile>\n";
    exit(1);
}



# Open the input for reading and the output for writing using the
# two-argument form of FileHandle->new (file name, mode). Passing the mode
# separately means the file name is taken literally instead of being glued
# onto a mode character and re-parsed, so names with leading/trailing
# whitespace or characters such as '>' or '|' open the intended file.
# $! is appended so the message states why the open failed.
my $fh = FileHandle->new($inputFile, "r")
    or die "could not open $inputFile: $!\n";
my $outFh = FileHandle->new($outputFile, "w")
    or die "could not open $outputFile for writing: $!\n";


# Scan the input line by line. Each line containing ">gi" whose third
# '|'-separated field is "gb" contributes up to two rows of the form
# "<identifier>\t<gi>" to the output file.
while (my $line = <$fh>) {
    chomp $line;

    # Skip lines that do not contain ">gi".
    next unless $line =~ /\>gi/;

    # Fields by position: [1] gi number, [2] database tag,
    # [3] first identifier, [4] optional second identifier.
    my @cols = split('\|', $line);
    my ($gi, $secondaryId, $firstEntry, $secondEntry) = @cols[1 .. 4];

    # A truncated header may have no database tag at all; only "gb" records
    # are mapped.
    next unless defined($secondaryId) && $secondaryId eq "gb";

    my @ids;

    # First identifier: keep only the text before the last '.'
    # (e.g. "ABQ15152.1" becomes "ABQ15152").
    if (defined $firstEntry) {
        if ($firstEntry =~ /(.*)\./) {
            $firstEntry = $1;
        }
        push @ids, $firstEntry;
    }

    # Second identifier: present only when the field starts with a word
    # character (if there is no second entry, a space follows the '|').
    # Keep the leading word, then only the text before its last '_'.
    if (defined($secondEntry) && $secondEntry =~ /^(\w+)/) {
        my $secondId = $1;
        if ($secondId =~ /(.*)\_/) {
            $secondId = $1;
        }
        push @ids, $secondId;
    }

    # Never emit a row with an empty key: it cannot be looked up and would
    # only pollute the map.
    for my $id (@ids) {
        next unless length $id;
        print $outFh "$id\t$gi\n";
    }
}

close($fh);

# Output is buffered, so a write failure (e.g. a full disk) may only become
# visible when the handle is closed; report it instead of exiting silently
# with a truncated map.
close($outFh)
    or die "could not finish writing $outputFile: $!\n";
