modeller>VimalkumarVelayudhan
No edit summary
(Add bug fixes.)
Line 5: Line 5:


/!\ Please Note: This script is useful only for an alignment containing two sequences (target and a template.). Also complete error checks are not yet implemented.  
/!\ Please Note: This script is useful only for an alignment containing two sequences (target and a template.). Also complete error checks are not yet implemented.  
Updated 11/7/07 by James Haven: add some error checking and extract correct energy values.




<pre><nowiki>
<pre><nowiki>
#!/usr/bin/perl
#!/usr/bin/perl -w
 


$save_aln_sep = $/;
$save_aln_sep = $/;


#Obtain the alignment file and extract the sequences into respective variables using their code


#Open the alignment file
print "Enter Alignment filename (PIR format) [Example: TvLDH-1bdmA.ali] : ";
print "Enter Alignment filename (PIR format) [Example: TvLDH-1bdmA.ali] : ";
$alnfname = <STDIN>;
open(ALNFNAME, $alnfname) or die "Alignment file not found. Please check if file exists or if filename is correct\n";


$alnfname = <STDIN>;


unless(open(ALNFNAME, $alnfname))
{
print "Alignment file not found. Please check if file exists or if the filename is correct\n";
exit;
}


#Get the Target Code
print "Enter code of the Target (Model) sequence [Example: TvLDH]: ";
print "Enter code of the Target (Model) sequence [Example: TvLDH]: ";
$targcode = <STDIN>;
$targcode = <STDIN>;
chomp $targcode;
chomp $targcode;


#Get the Template Code
print "Enter code of the Template structure [Example: 1bdmA]: ";
print "Enter code of the Template structure [Example: 1bdmA]: ";
$tempcode = <STDIN>;  
$tempcode = <STDIN>;  
chomp $tempcode;
chomp $tempcode;


$/ = undef;
$/ = undef;


#Store alignment data
$alignment = <ALNFNAME>;
$alignment = <ALNFNAME>;
close ALNFNAME;


#Here we split the alignment using the >P1 found in all the PIR sequences, so that the two sequences can be obtained in individual #variables. check_aln performs this job. Once obtained, the sequence alone is extracted using the parse_aln subroutine.


#Here we split the alignment using the >P1 found in all the PIR sequences, so that
#the two sequences can be obtained in individual #variables. check_aln performs this job.
#Once obtained, the sequence alone is extracted using the parse_aln subroutine.
@alnseqs = split(/>P1;/, $alignment);
@alnseqs = split(/>P1;/, $alignment);




#extract TARGET sequence
$dtargseq = check_aln(\@alnseqs,\$targcode);
$dtargseq = check_aln(\@alnseqs,\$targcode);
@ftargseq = parse_aln($dtargseq, $targcode);


@ftargseq = parse_aln($dtargseq, $targcode);


#extract TEMPLATE sequence
$dtempseq = check_aln(\@alnseqs,\$tempcode);
$dtempseq = check_aln(\@alnseqs,\$tempcode);
@ftempseq = parse_aln($dtempseq, $tempcode);
@ftempseq = parse_aln($dtempseq, $tempcode);


#The gap positions are identified using this subroutine


#extract TARGET gap positions
@targgaps = calc_gaps(@ftargseq);
@targgaps = calc_gaps(@ftargseq);
print "Target gaps are at @targgaps\n";


print "Target gaps are at @targgaps\n";


#extract TEMPLATE gap positions
@tempgaps = calc_gaps(@ftempseq);
@tempgaps = calc_gaps(@ftempseq);
print "Template gaps are at @tempgaps\n";


print "Template gaps are at @tempgaps\n";


#The energy profiles contain some extra information. only the energy values of the residues are required. The subroutine profiletodat() #performs this function  
#The energy profiles contain some extra information. only the energy values of the
#residues are required. The subroutine profiletodat() performs this function  
$/ = $save_aln_sep;
$/ = $save_aln_sep;


#Check if the profile files are already present. Else ask for a filename.
 
 
#Check if the TARGET profile files are already present. Else ask for a filename.
$targprofile = "$targcode.profile";
$targprofile = "$targcode.profile";
if(!open(TARGPROFILE, $targprofile))
if(!open(TARGPROFILE, $targprofile))
{
{
print "Enter name of Target profile [Example: TvLDH.profile]: ";
        print "Enter name of Target profile [Example: TvLDH.profile]: ";
$targprofile = <STDIN>;
        $targprofile = <STDIN>;
}
}
open(TARGPROFILE, $targprofile) or die "Can't open Target profile: $targprofile\n";
@targprofile = <TARGPROFILE>;
close TARGPROFILE;


open(TARGPROFILE, $targprofile);
@targprofile = <TARGPROFILE>;




#Check if the TEMPLATE profile files are already present. Else ask for a filename.
$tempprofile = "$tempcode.profile";
$tempprofile = "$tempcode.profile";
if(!open(TEMPPROFILE, $tempprofile))
if(!open(TEMPPROFILE, $tempprofile))
{
{
print "Enter name of Template profile [Example: 1bdmA.profile]: ";
        print "Enter name of Template profile [Example: 1bdmA.profile]: ";
$tempprofile = <STDIN>;
        $tempprofile = <STDIN>;
}
}
open(TEMPPROFILE, $tempprofile) or die "Can't open Template profile: $tempprofile\n";
@tempprofile = <TEMPPROFILE>;
close TEMPPROFILE;


open(TEMPPROFILE, $tempprofile);


@tempprofile = <TEMPPROFILE>;
#Parse TARGET Profile file
@targdat = profiletodat(@targprofile);




@targdat = profiletodat(@targprofile);


#Parse TEMPLATE Profile file
@tempdat = profiletodat(@tempprofile);
@tempdat = profiletodat(@tempprofile);


Line 94: Line 119:


#The gaps in the sequences do not contain energy values. Here we modify the dat file(previous step) to  
#The gaps in the sequences do not contain energy values. Here we modify the dat file(previous step) to  
@mtargdat = modify_dat(\@targdat, \@targgaps);
#Correct TARGET energy data for gaps (insert ? in gapped positions) => these sites won't be plotted
@mtargdat_old = modify_dat(\@targdat, \@targgaps);
for (my $i=0; $i<scalar(@mtargdat_old)-1; $i++) {
push @mtargdat, $i."\t".$mtargdat_old[$i]."\n";
}
 
 
 
#Correct TEMPLATE energy data for gaps (insert ? in gapped positions) => these sites won't be plotted
@mtempdat_old = modify_dat(\@tempdat, \@tempgaps);
for (my $i=0; $i<scalar(@mtempdat_old)-1; $i++) {
push @mtempdat, $i."\t".$mtempdat_old[$i]."\n";
}
 


@mtempdat = modify_dat(\@tempdat, \@tempgaps);


##GnuPlot does not accept arrays. Hence we save the two dat files for plotting
##GnuPlot does not accept arrays. Hence we save the two dat files for plotting
#Write TARGET dat (see previous block) to file
$gtargdat = "$targcode.dat";
$gtargdat = "$targcode.dat";
open(GTARGDAT, ">$gtargdat");
print GTARGDAT @mtargdat;


open(GTARGDAT, ">$gtargdat");


print GTARGDAT @mtargdat;


#Write TEMPLATE dat (see previous block) to file
$gtempdat = "$tempcode.dat";
$gtempdat = "$tempcode.dat";
open(GTEMPDAT, ">$gtempdat");
open(GTEMPDAT, ">$gtempdat");
print GTEMPDAT "@mtempdat";
print GTEMPDAT "@mtempdat";


#Finally plot these values. plot_dope calls gnuplot and plots the data.




#Finally plot these values. plot_dope calls gnuplot and plots the data.
$dopeout = "dope.png";
$dopeout = "dope.png";
open(DOPEOUT, $dopeout);
open(DOPEOUT, $dopeout);
if( -e DOPEOUT)
if( -e DOPEOUT)
{
{
print "$dopeout exists\n";
        print "$dopeout exists\n";
$dopeout = check_dope();
        $dopeout = check_dope();
}
}
plot_dope($gtargdat, $gtempdat, $dopeout);


plot_dope($gtargdat, $gtempdat, $dopeout);




Line 134: Line 167:
sub check_aln
sub check_aln
{
{
my($alnseqs, $code)= @_;
        my($alnseqs, $code)= @_;
       
foreach $seq(@$alnseqs)
        foreach $seq(@$alnseqs)
{
        {
if($seq =~ /$$code/)
                if($seq =~ /$$code/)
{
                {
return $seq;
                        $return_seq = $seq;
}
                }
}
        }
if($return_seq eq '') {die "Can't find code: $$code in alignment; subroutine check_aln failed!\n";}
return $return_seq;
}
}




#Parse alignment files to obatain sequence alone
 
#Parse alignment files to obtain sequence alone
sub parse_aln
sub parse_aln
{
{
       
my($seq, $code) = @_;
        my($seq, $code) = @_;
 
        my @seq = $seq;
        my @parseq = ();


my @seq = $seq;
        foreach $line(@seq)
my @parseq = ();
        {
                $line =~ s/\n//;


foreach $line(@seq)
                if($line =~ /^$code/)
{
                {
$line =~ s/\n//;
                        $line =~ s/$code.*//;
                        $line =~ s/(sequence|structureX)[:](.*)*//;
                        $line =~ s/\n//s;
                       
                        push(@parseq, $line);
                }
        }


if($line =~ /^$code/)
if (scalar(@parseq) == 0) { die "Check your alignment (PIR) file; subroutine parse_aln failed!\n"}
{
        return @parseq;
$line =~ s/$code.*//;
$line =~ s/(sequence|structureX)[:](.*)*//;
$line =~ s/\n//s;
push(@parseq, $line);
}
}
return @parseq;
}
}


Line 176: Line 214:
sub calc_gaps
sub calc_gaps
{
{
my ($sequence) = @_;
        my ($sequence) = @_;
$sequence =~ s/\n//;
        $sequence =~ s/\n//;
$lenseq = length($sequence);
        my $position = 0;
# print "Length of sequence is $lenseq\n";
        my $gap = 0;
my $position = 0;
        my @gaps = ();
my $gap = 0;
my @gaps = ();


for($position=0;$position < length($sequence);$position++)
        for($position=0;$position <= length($sequence)-1;$position++)


{
        {
$gap = substr($sequence, $position-1, 1);
                $gap = substr($sequence, $position, 1);


if($gap eq '-')
                if($gap eq '-')
{
                {
push(@gaps, $position);
                        push(@gaps, $position+1);
}
                }
}
        }
return @gaps;
if(scalar(@gaps) == 0) { die "subroutine calc_gaps failed\n"}
        return @gaps;


}
}
Line 201: Line 238:
sub profiletodat
sub profiletodat
{
{
my(@profile) = @_;
        my(@profile) = @_;
my $energy='';
        my $energy='';
my @modprofile;
        my @modprofile;


foreach my $line(@profile)
        foreach my $line(@profile)
{
        {
               
if($line =~ /^\s*#/)
                if($line =~ /^\s*\#/)
{
                {
next;
                        next;
}
                }
else
                else
{
                {                      
$energy = substr($line, 436, 10);
@a = split /\s+/, $line;
push(@modprofile,$energy);
$energy = $a[42];                       
}
#$energy = substr($line, 436, 10);
}
#print $energy,"\n";
                        push(@modprofile,$energy);
                }
        }
if (scalar(@modprofile) == 0) { die "Something is wrong with an Energy Profile file; subroutine
                                            profiletodat failed\n"}   
shift @modprofile;
return @modprofile;
}      


return @modprofile;
}




Line 227: Line 270:
sub modify_dat
sub modify_dat
{
{
my($dat, $gaps) = @_;
        my($dat, $gaps) = @_;


foreach $position(@$gaps)
        foreach $position(@$gaps)
{
                {
for($datposition=0;$datposition < scalar @$dat; ++$datposition)
                        for($datposition=0;$datposition < scalar @$dat; ++$datposition)
{
                        {
if($datposition == $position)
                                if($datposition == $position)
{
                                {
splice(@$dat, $datposition, 0, "?\n");
                                        splice(@$dat, $datposition, 0, "?");
}
                                }
}
                        }
}
                }


return @$dat;
        return @$dat;
}
}




Line 248: Line 292:
{
{


print "Should I overwrite (y/n): ";
        print "Should I overwrite (y/n): ";
$check = <STDIN>;
        $check = <STDIN>;
chomp $check;
        chomp $check;


if($check eq 'y')
        if($check =~ /y/i)
{
        {              
exit;
return $dopeout;
        }
}
        elsif($check =~ /n/i)
elsif($check eq 'n')
        {
{
                print "Enter new filename : ";
print "Enter new filename : ";
                $dopeout=<STDIN>;
$dopeout=<STDIN>;
                chomp $dopeout;
chomp $dopeout;
                return $dopeout;
        }
}
        else
else
        {
{
                print "Please enter y/n\n";
print "Please enter y/n";
$dopeout = check_dope();
exit;
        }
}
       
}
return $dopeout;


}




Line 278: Line 320:
sub plot_dope
sub plot_dope
{
{
my($targdat, $tempdat, $dopeout) = @_;
        my($targdat, $tempdat, $dopeout) = @_;
       
my $gnuplot = "/usr/bin/gnuplot";
        my $gnuplot = "/usr/bin/gnuplot";
       
my $terminal = "set terminal png\;\n";
        my $terminal = "set terminal png\;\n";
my $xlabel = "set xlabel \"Residue Index\"\;\n";
        my $xlabel = "set xlabel \"Residue Index\"\;\n";
my $ylabel = "set ylabel \"DOPE Score\"\;\n";
        my $ylabel = "set ylabel \"DOPE Score\"\;\n";
my $missing = "set datafile missing \"?\"\;\n";
        my $missing = "set datafile missing \"?\"\;\n";
my $output = "set output \"$dopeout\"\;\n";
        my $output = "set output \"$dopeout\"\;\n";
       
my $plot = "plot \"$targdat\" with lines, \"$tempdat\" with lines\;\n";
        my $plot = "plot \"$targdat\" using 1:2 with lines, \"$tempdat\" using 1:2 with lines\;\n";


my @plotset = ($terminal,$xlabel,$ylabel,$missing,$output,$plot);
        my @plotset = ($terminal,$xlabel,$ylabel,$missing,$output,$plot);
# print "@plotset\n";
#       print "@plotset\n";


my $plotscript = "plotscript.txt";
        my $plotscript = "plotscript.txt";
       
open(PLOTSCRIPT, ">$plotscript");
        open(PLOTSCRIPT, ">$plotscript") or die "Can't open file $plotscript for writing\n";
       
print PLOTSCRIPT "@plotset";
        print PLOTSCRIPT @plotset;


close (PLOTSCRIPT);
        close (PLOTSCRIPT);
       
my $finalscript = "plotscript.txt";
        my $finalscript = "plotscript.txt";
       
# open(FINALSCRIPT, $finalscript);
#       open(FINALSCRIPT, $finalscript);
#
#      
# @gnuplotscript = <FINALSCRIPT>;
#       @gnuplotscript = <FINALSCRIPT>;
       
`$gnuplot $finalscript`;
        `$gnuplot $finalscript`;


print "The DOPE plot is in $dopeout\n";
        print "The DOPE plot is in $dopeout\n";
}
}
</nowiki></pre>
</nowiki></pre>





Revision as of 17:55, 7 November 2007

Plotdope is a perl script to plot the energy values obtained as output of the model evaluation step of MODELLER.

Often, there are gaps in the alignment of the target and template. In the model evaluation step, the energy values at these positions are not computed. To compare the energy profiles of two structures, the positions of these residues should match. This script obtains the alignment and the energy profiles as input and then introduces gaps accordingly in the parsed profile files and then plots the resulting data using Gnuplot

/!\ Please Note: This script is useful only for an alignment containing two sequences (target and a template.). Also complete error checks are not yet implemented.

Updated 11/7/07 by James Haven: add some error checking and extract correct energy values.


#!/usr/bin/perl -w


$save_aln_sep = $/;


#Open the alignment file 
print "Enter Alignment filename (PIR format) [Example: TvLDH-1bdmA.ali] : ";
$alnfname = <STDIN>;
open(ALNFNAME, $alnfname) or die "Alignment file not found. Please check if file exists or if filename is correct\n";



#Get the Target Code
print "Enter code of the Target (Model) sequence [Example: TvLDH]: ";
$targcode = <STDIN>;
chomp $targcode;



#Get the Template Code
print "Enter code of the Template structure [Example: 1bdmA]: ";
$tempcode = <STDIN>; 
chomp $tempcode;


$/ = undef;


#Store alignment data
$alignment = <ALNFNAME>;
close ALNFNAME;



#Here we split the alignment using the >P1 found in all the PIR sequences, so that
#the two sequences can be obtained in individual #variables. check_aln performs this job.
#Once obtained, the sequence alone is extracted using the parse_aln subroutine.
@alnseqs = split(/>P1;/, $alignment);



#extract TARGET sequence
$dtargseq = check_aln(\@alnseqs,\$targcode);
@ftargseq = parse_aln($dtargseq, $targcode);



#extract TEMPLATE sequence
$dtempseq = check_aln(\@alnseqs,\$tempcode);
@ftempseq = parse_aln($dtempseq, $tempcode);



#extract TARGET gap positions
@targgaps = calc_gaps(@ftargseq);
print "Target gaps are at @targgaps\n";



#extract TEMPLATE gap positions
@tempgaps = calc_gaps(@ftempseq);
print "Template gaps are at @tempgaps\n";



#The energy profiles contain some extra information. only the energy values of the
#residues are required. The subroutine profiletodat() performs this function 
$/ = $save_aln_sep;



#Check if the TARGET profile files are already present. Else ask for a filename.
$targprofile = "$targcode.profile";
if(!open(TARGPROFILE, $targprofile))
{
        print "Enter name of Target profile [Example: TvLDH.profile]: ";
        $targprofile = <STDIN>;
}
open(TARGPROFILE, $targprofile) or die "Can't open Target profile: $targprofile\n";
@targprofile = <TARGPROFILE>;
close TARGPROFILE;



#Check if the TEMPLATE profile files are already present. Else ask for a filename.
$tempprofile = "$tempcode.profile";
if(!open(TEMPPROFILE, $tempprofile))
{
        print "Enter name of Template profile [Example: 1bdmA.profile]: ";
        $tempprofile = <STDIN>;
}
open(TEMPPROFILE, $tempprofile) or die "Can't open Template profile: $tempprofile\n";
@tempprofile = <TEMPPROFILE>;
close TEMPPROFILE;



#Parse TARGET Profile file
@targdat = profiletodat(@targprofile);



#Parse TEMPLATE Profile file
@tempdat = profiletodat(@tempprofile);



#The gaps in the sequences do not contain energy values. Here we modify the dat file(previous step) to 
#Correct TARGET energy data for gaps (insert ? in gapped positions) => these sites won't be plotted
@mtargdat_old = modify_dat(\@targdat, \@targgaps);
for (my $i=0; $i<scalar(@mtargdat_old)-1; $i++) {
	push @mtargdat, $i."\t".$mtargdat_old[$i]."\n";
}



#Correct TEMPLATE energy data for gaps (insert ? in gapped positions) => these sites won't be plotted
@mtempdat_old = modify_dat(\@tempdat, \@tempgaps);
for (my $i=0; $i<scalar(@mtempdat_old)-1; $i++) {
	push @mtempdat, $i."\t".$mtempdat_old[$i]."\n";
}



##GnuPlot does not accept arrays. Hence we save the two dat files for plotting
#Write TARGET dat (see previous block) to file
$gtargdat = "$targcode.dat";
open(GTARGDAT, ">$gtargdat");
print GTARGDAT @mtargdat;



#Write TEMPLATE dat (see previous block) to file
$gtempdat = "$tempcode.dat";
open(GTEMPDAT, ">$gtempdat");
print GTEMPDAT "@mtempdat";



#Finally plot these values. plot_dope calls gnuplot and plots the data. 
$dopeout = "dope.png";
open(DOPEOUT, $dopeout);
if( -e DOPEOUT)
{
        print "$dopeout exists\n";
        $dopeout = check_dope();
}
plot_dope($gtargdat, $gtempdat, $dopeout);




###############################SUBROUTINES###################
#check alignment and send sequences for parsing
sub check_aln
{
        my($alnseqs, $code)= @_;
        
        foreach $seq(@$alnseqs)
        {
                if($seq =~ /$$code/)
                {
                        $return_seq = $seq;
                }
        }
	if($return_seq eq '') {die "Can't find code: $$code in alignment; subroutine check_aln failed!\n";}
	return $return_seq;
}



#Parse alignment files to obtain sequence alone
sub parse_aln
{
        
        my($seq, $code) = @_;

        my @seq = $seq;
        my @parseq = ();

        foreach $line(@seq)
        {
                $line =~ s/\n//;

                if($line =~ /^$code/)
                {
                        $line =~ s/$code.*//;
                        $line =~ s/(sequence|structureX)[:](.*)*//;
                        $line =~ s/\n//s;
                        
                        push(@parseq, $line);
                }
        }

	if (scalar(@parseq) == 0) { die "Check your alignment (PIR) file; subroutine parse_aln failed!\n"}
        return @parseq;
}



#calculates gaps given the sequence alone 
sub calc_gaps
{
        my ($sequence) = @_;
        $sequence =~ s/\n//;
        my $position = 0;
        my $gap = 0;
        my @gaps = ();

        for($position=0;$position <= length($sequence)-1;$position++)

        {
                $gap = substr($sequence, $position, 1);

                if($gap eq '-')
                {
                        push(@gaps, $position+1);
                }
        }
	if(scalar(@gaps) == 0) { die "subroutine calc_gaps failed\n"}
        return @gaps;

}

#To convert the draft profile to working dat file
sub profiletodat
{
        my(@profile) = @_;
        my $energy='';
        my @modprofile;

        foreach my $line(@profile)
        {
                
                if($line =~ /^\s*\#/)
                {
                        next;
                }
                else
                {                       
			@a = split /\s+/, $line;
			$energy = $a[42];                        
			#$energy = substr($line, 436, 10);
			#print $energy,"\n";
                        push(@modprofile,$energy);
                }
        }
	if (scalar(@modprofile) == 0) { die "Something is wrong with an Energy Profile file; subroutine
                                             profiletodat failed\n"}     
	shift @modprofile;
	return @modprofile;
}       



#To modify the dat files. Receives reference dat file and gap file containg positions of gaps

sub modify_dat
{
        my($dat, $gaps) = @_;

        foreach $position(@$gaps)
                {
                        for($datposition=0;$datposition < scalar @$dat; ++$datposition)
                        {
                                if($datposition == $position)
                                {
                                        splice(@$dat, $datposition, 0, "?");
                                }
                        }
                }

        return @$dat;
}



#Check for output file dope.png. If it exists, ask for a new filename
sub check_dope
{

        print "Should I overwrite (y/n): ";
        $check = <STDIN>;
        chomp $check;

        if($check =~ /y/i)
        {               
		return $dopeout;
        }
        elsif($check =~ /n/i)
        {
                print "Enter new filename : ";
                $dopeout=<STDIN>;
                chomp $dopeout;
                return $dopeout;
        }
        else
        {
                print "Please enter y/n\n";
		$dopeout = check_dope();
        }
        
}



#Plotting the values using gnuplot
sub plot_dope
{
        my($targdat, $tempdat, $dopeout) = @_;
        
        my $gnuplot = "/usr/bin/gnuplot";
        
        my $terminal = "set terminal png\;\n";
        my $xlabel = "set xlabel \"Residue Index\"\;\n";
        my $ylabel = "set ylabel \"DOPE Score\"\;\n";
        my $missing = "set datafile missing \"?\"\;\n";
        my $output = "set output \"$dopeout\"\;\n";
        
        my $plot = "plot \"$targdat\" using 1:2 with lines, \"$tempdat\" using 1:2 with lines\;\n";

        my @plotset = ($terminal,$xlabel,$ylabel,$missing,$output,$plot);
#       print "@plotset\n";

        my $plotscript = "plotscript.txt";
        
        open(PLOTSCRIPT, ">$plotscript") or die "Can't open file $plotscript for writing\n";
        
        print PLOTSCRIPT @plotset;

        close (PLOTSCRIPT);
        
        my $finalscript = "plotscript.txt";
        
#       open(FINALSCRIPT, $finalscript);
#       
#       @gnuplotscript = <FINALSCRIPT>;
        
        `$gnuplot $finalscript`;

        print "The DOPE plot is in $dopeout\n";
}