Plotdope: Difference between revisions
No edit summary |
modeller>VimalkumarVelayudhan No edit summary |
||
Line 5: | Line 5: | ||
/!\ Please Note: This script is useful only for an alignment containing two sequences (target and a template.). Also complete error checks are not yet implemented. | /!\ Please Note: This script is useful only for an alignment containing two sequences (target and a template.). Also complete error checks are not yet implemented. | ||
<pre><nowiki> | <pre><nowiki> | ||
#!/usr/bin/perl | #!/usr/bin/perl | ||
$save_aln_sep = $/; | $save_aln_sep = $/; | ||
#Obtain the alignment file and extract the sequences into respective variables using their code | |||
print "Enter Alignment filename (PIR format) [Example: TvLDH-1bdmA.ali] : "; | print "Enter Alignment filename (PIR format) [Example: TvLDH-1bdmA.ali] : "; | ||
$alnfname = <STDIN>; | $alnfname = <STDIN>; | ||
unless(open(ALNFNAME, $alnfname)) | |||
{ | |||
print "Alignment file not found. Please check if file exists or if the filename is correct\n"; | |||
exit; | |||
} | |||
print "Enter code of the Target (Model) sequence [Example: TvLDH]: "; | print "Enter code of the Target (Model) sequence [Example: TvLDH]: "; | ||
$targcode = <STDIN>; | $targcode = <STDIN>; | ||
chomp $targcode; | chomp $targcode; | ||
print "Enter code of the Template structure [Example: 1bdmA]: "; | print "Enter code of the Template structure [Example: 1bdmA]: "; | ||
$tempcode = <STDIN>; | $tempcode = <STDIN>; | ||
chomp $tempcode; | chomp $tempcode; | ||
$/ = undef; | $/ = undef; | ||
$alignment = <ALNFNAME>; | $alignment = <ALNFNAME>; | ||
#Here we split the alignment using the >P1 found in all the PIR sequences, so that the two sequences can be obtained in individual #variables. check_aln performs this job. Once obtained, the sequence alone is extracted using the parse_aln subroutine. | |||
@alnseqs = split(/>P1;/, $alignment); | @alnseqs = split(/>P1;/, $alignment); | ||
$dtargseq = check_aln(\@alnseqs,\$targcode); | |||
@ftargseq = parse_aln($dtargseq, $targcode); | @ftargseq = parse_aln($dtargseq, $targcode); | ||
$dtempseq = check_aln(\@alnseqs,\$tempcode); | $dtempseq = check_aln(\@alnseqs,\$tempcode); | ||
@ftempseq = parse_aln($dtempseq, $tempcode); | @ftempseq = parse_aln($dtempseq, $tempcode); | ||
#The gap positions are identified using this subroutine | |||
@targgaps = calc_gaps(@ftargseq); | |||
print "Target gaps are at @targgaps\n"; | print "Target gaps are at @targgaps\n"; | ||
@tempgaps = calc_gaps(@ftempseq); | |||
print "Template gaps are at @tempgaps\n"; | print "Template gaps are at @tempgaps\n"; | ||
#The energy profiles contain some extra information. only the energy values of the residues are required. The subroutine profiletodat() #performs this function | |||
#The energy profiles contain some extra information. only the energy values of the | |||
$/ = $save_aln_sep; | $/ = $save_aln_sep; | ||
#Check if the profile files are already present. Else ask for a filename. | |||
$targprofile = "$targcode.profile"; | |||
if(!open(TARGPROFILE, $targprofile)) | if(!open(TARGPROFILE, $targprofile)) | ||
{ | { | ||
print "Enter name of Target profile [Example: TvLDH.profile]: "; | |||
$targprofile = <STDIN>; | |||
} | } | ||
open(TARGPROFILE, $targprofile) | |||
open(TARGPROFILE, $targprofile); | |||
@targprofile = <TARGPROFILE>; | @targprofile = <TARGPROFILE>; | ||
$tempprofile = "$tempcode.profile"; | |||
if(!open(TEMPPROFILE, $tempprofile)) | if(!open(TEMPPROFILE, $tempprofile)) | ||
{ | { | ||
print "Enter name of Template profile [Example: 1bdmA.profile]: "; | |||
$tempprofile = <STDIN>; | |||
} | } | ||
open(TEMPPROFILE, $tempprofile) | |||
open(TEMPPROFILE, $tempprofile); | |||
@tempprofile = <TEMPPROFILE>; | @tempprofile = <TEMPPROFILE>; | ||
@targdat = profiletodat(@targprofile); | @targdat = profiletodat(@targprofile); | ||
@tempdat = profiletodat(@tempprofile); | @tempdat = profiletodat(@tempprofile); | ||
Line 119: | Line 94: | ||
#The gaps in the sequences do not contain energy values. Here we modify the dat file(previous step) to | #The gaps in the sequences do not contain energy values. Here we modify the dat file(previous step) to | ||
@mtargdat = modify_dat(\@targdat, \@targgaps); | |||
@ | |||
@mtempdat = modify_dat(\@tempdat, \@tempgaps); | |||
##GnuPlot does not accept arrays. Hence we save the two dat files for plotting | ##GnuPlot does not accept arrays. Hence we save the two dat files for plotting | ||
$gtargdat = "$targcode.dat"; | $gtargdat = "$targcode.dat"; | ||
open(GTARGDAT, ">$gtargdat"); | open(GTARGDAT, ">$gtargdat"); | ||
print GTARGDAT @mtargdat; | print GTARGDAT @mtargdat; | ||
$gtempdat = "$tempcode.dat"; | |||
open(GTEMPDAT, ">$gtempdat"); | |||
print GTEMPDAT "@mtempdat"; | print GTEMPDAT "@mtempdat"; | ||
#Finally plot these values. plot_dope calls gnuplot and plots the data. | |||
$dopeout = "dope.png"; | $dopeout = "dope.png"; | ||
open(DOPEOUT, $dopeout); | open(DOPEOUT, $dopeout); | ||
if( -e DOPEOUT) | if( -e DOPEOUT) | ||
{ | { | ||
print "$dopeout exists\n"; | |||
$dopeout = check_dope(); | |||
} | } | ||
plot_dope($gtargdat, $gtempdat, $dopeout); | plot_dope($gtargdat, $gtempdat, $dopeout); | ||
Line 167: | Line 134: | ||
sub check_aln | sub check_aln | ||
{ | { | ||
my($alnseqs, $code)= @_; | |||
foreach $seq(@$alnseqs) | |||
{ | |||
if($seq =~ /$$code/) | |||
{ | |||
return $seq; | |||
} | |||
} | |||
} | } | ||
#Parse alignment files to obatain sequence alone | |||
#Parse alignment files to | |||
sub parse_aln | sub parse_aln | ||
{ | { | ||
my($seq, $code) = @_; | |||
my @seq = $seq; | |||
my @parseq = (); | |||
foreach $line(@seq) | |||
{ | |||
$line =~ s/\n//; | |||
if($line =~ /^$code/) | |||
{ | |||
$line =~ s/$code.*//; | |||
$line =~ s/(sequence|structureX)[:](.*)*//; | |||
$line =~ s/\n//s; | |||
push(@parseq, $line); | |||
} | |||
} | |||
return @parseq; | |||
} | } | ||
Line 214: | Line 176: | ||
sub calc_gaps | sub calc_gaps | ||
{ | { | ||
my ($sequence) = @_; | |||
$sequence =~ s/\n//; | |||
$lenseq = length($sequence); | |||
# print "Length of sequence is $lenseq\n"; | |||
my $position = 0; | |||
my $gap = 0; | |||
my @gaps = (); | |||
for($position=0;$position < length($sequence);$position++) | |||
{ | |||
$gap = substr($sequence, $position-1, 1); | |||
if($gap eq '-') | |||
{ | |||
push(@gaps, $position); | |||
} | |||
} | |||
return @gaps; | |||
} | } | ||
Line 238: | Line 201: | ||
sub profiletodat | sub profiletodat | ||
{ | { | ||
my(@profile) = @_; | |||
my $energy=''; | |||
my @modprofile; | |||
foreach my $line(@profile) | |||
{ | |||
if($line =~ /^\s*#/) | |||
{ | |||
next; | |||
} | |||
else | |||
{ | |||
$energy = substr($line, 436, 10); | |||
push(@modprofile,$energy); | |||
} | |||
} | |||
return @modprofile; | return @modprofile; | ||
} | } | ||
Line 270: | Line 227: | ||
sub modify_dat | sub modify_dat | ||
{ | { | ||
my($dat, $gaps) = @_; | |||
foreach $position(@$gaps) | |||
{ | |||
for($datposition=0;$datposition < scalar @$dat; ++$datposition) | |||
{ | |||
if($datposition == $position) | |||
{ | |||
splice(@$dat, $datposition, 0, "?\n"); | |||
} | |||
} | |||
} | |||
return @$dat; | |||
} | } | ||
Line 292: | Line 248: | ||
{ | { | ||
print "Should I overwrite (y/n): "; | |||
$check = <STDIN>; | |||
chomp $check; | |||
if($check eq 'y') | |||
{ | |||
exit; | |||
} | |||
elsif($check eq 'n') | |||
{ | |||
print "Enter new filename : "; | |||
$dopeout=<STDIN>; | |||
chomp $dopeout; | |||
} | |||
else | |||
{ | |||
print "Please enter y/n"; | |||
exit; | |||
} | |||
return $dopeout; | |||
} | } | ||
Line 320: | Line 278: | ||
sub plot_dope | sub plot_dope | ||
{ | { | ||
my($targdat, $tempdat, $dopeout) = @_; | |||
my $gnuplot = "/usr/bin/gnuplot"; | |||
my $terminal = "set terminal png\;\n"; | |||
my $xlabel = "set xlabel \"Residue Index\"\;\n"; | |||
my $ylabel = "set ylabel \"DOPE Score\"\;\n"; | |||
my $missing = "set datafile missing \"?\"\;\n"; | |||
my $output = "set output \"$dopeout\"\;\n"; | |||
my $plot = "plot \"$targdat\" with lines, \"$tempdat\" with lines\;\n"; | |||
my @plotset = ($terminal,$xlabel,$ylabel,$missing,$output,$plot); | |||
# | # print "@plotset\n"; | ||
my $plotscript = "plotscript.txt"; | |||
open(PLOTSCRIPT, ">$plotscript"); | |||
print PLOTSCRIPT "@plotset"; | |||
close (PLOTSCRIPT); | |||
my $finalscript = "plotscript.txt"; | |||
# | # open(FINALSCRIPT, $finalscript); | ||
# | # | ||
# | # @gnuplotscript = <FINALSCRIPT>; | ||
`$gnuplot $finalscript`; | |||
print "The DOPE plot is in $dopeout\n"; | |||
} | } | ||
</nowiki></pre> | </nowiki></pre> | ||
Revision as of 13:32, 16 March 2006
Plotdope is a perl script to plot the energy values obtained as output of the model evaluation step of MODELLER.
Often, there are gaps in the alignment of the target and template. In the model evaluation step, the energy values at these positions are not computed. To compare the energy profiles of two structures, the positions of these residues should match. This script obtains the alignment and the energy profiles as input and then introduces gaps accordingly in the parsed profile files and then plots the resulting data using Gnuplot
/!\ Please Note: This script is useful only for an alignment containing two sequences (target and a template.). Also complete error checks are not yet implemented.
#!/usr/bin/perl $save_aln_sep = $/; #Obtain the alignment file and extract the sequences into respective variables using their code print "Enter Alignment filename (PIR format) [Example: TvLDH-1bdmA.ali] : "; $alnfname = <STDIN>; unless(open(ALNFNAME, $alnfname)) { print "Alignment file not found. Please check if file exists or if the filename is correct\n"; exit; } print "Enter code of the Target (Model) sequence [Example: TvLDH]: "; $targcode = <STDIN>; chomp $targcode; print "Enter code of the Template structure [Example: 1bdmA]: "; $tempcode = <STDIN>; chomp $tempcode; $/ = undef; $alignment = <ALNFNAME>; #Here we split the alignment using the >P1 found in all the PIR sequences, so that the two sequences can be obtained in individual #variables. check_aln performs this job. Once obtained, the sequence alone is extracted using the parse_aln subroutine. @alnseqs = split(/>P1;/, $alignment); $dtargseq = check_aln(\@alnseqs,\$targcode); @ftargseq = parse_aln($dtargseq, $targcode); $dtempseq = check_aln(\@alnseqs,\$tempcode); @ftempseq = parse_aln($dtempseq, $tempcode); #The gap positions are identified using this subroutine @targgaps = calc_gaps(@ftargseq); print "Target gaps are at @targgaps\n"; @tempgaps = calc_gaps(@ftempseq); print "Template gaps are at @tempgaps\n"; #The energy profiles contain some extra information. only the energy values of the residues are required. The subroutine profiletodat() #performs this function $/ = $save_aln_sep; #Check if the profile files are already present. Else ask for a filename. $targprofile = "$targcode.profile"; if(!open(TARGPROFILE, $targprofile)) { print "Enter name of Target profile [Example: TvLDH.profile]: "; $targprofile = <STDIN>; } open(TARGPROFILE, $targprofile); @targprofile = <TARGPROFILE>; $tempprofile = "$tempcode.profile"; if(!open(TEMPPROFILE, $tempprofile)) { print "Enter name of Template profile [Example: 1bdmA.profile]: "; $tempprofile = <STDIN>; } open(TEMPPROFILE, $tempprofile); @tempprofile = <TEMPPROFILE>; @targdat = profiletodat(@targprofile); @tempdat = profiletodat(@tempprofile); #The gaps in the sequences do not contain energy values. Here we modify the dat file(previous step) to @mtargdat = modify_dat(\@targdat, \@targgaps); @mtempdat = modify_dat(\@tempdat, \@tempgaps); ##GnuPlot does not accept arrays. Hence we save the two dat files for plotting $gtargdat = "$targcode.dat"; open(GTARGDAT, ">$gtargdat"); print GTARGDAT @mtargdat; $gtempdat = "$tempcode.dat"; open(GTEMPDAT, ">$gtempdat"); print GTEMPDAT "@mtempdat"; #Finally plot these values. plot_dope calls gnuplot and plots the data. $dopeout = "dope.png"; open(DOPEOUT, $dopeout); if( -e DOPEOUT) { print "$dopeout exists\n"; $dopeout = check_dope(); } plot_dope($gtargdat, $gtempdat, $dopeout); ###############################SUBROUTINES################### #check alignment and send sequences for parsing sub check_aln { my($alnseqs, $code)= @_; foreach $seq(@$alnseqs) { if($seq =~ /$$code/) { return $seq; } } } #Parse alignment files to obatain sequence alone sub parse_aln { my($seq, $code) = @_; my @seq = $seq; my @parseq = (); foreach $line(@seq) { $line =~ s/\n//; if($line =~ /^$code/) { $line =~ s/$code.*//; $line =~ s/(sequence|structureX)[:](.*)*//; $line =~ s/\n//s; push(@parseq, $line); } } return @parseq; } #calculates gaps given the sequence alone sub calc_gaps { my ($sequence) = @_; $sequence =~ s/\n//; $lenseq = length($sequence); # print "Length of sequence is $lenseq\n"; my $position = 0; my $gap = 0; my @gaps = (); for($position=0;$position < length($sequence);$position++) { $gap = substr($sequence, $position-1, 1); if($gap eq '-') { push(@gaps, $position); } } return @gaps; } #To convert the draft profile to working dat file sub profiletodat { my(@profile) = @_; my $energy=''; my @modprofile; foreach my $line(@profile) { if($line =~ /^\s*#/) { next; } else { $energy = substr($line, 436, 10); push(@modprofile,$energy); } } return @modprofile; } #To modify the dat files. Receives reference dat file and gap file containg positions of gaps sub modify_dat { my($dat, $gaps) = @_; foreach $position(@$gaps) { for($datposition=0;$datposition < scalar @$dat; ++$datposition) { if($datposition == $position) { splice(@$dat, $datposition, 0, "?\n"); } } } return @$dat; } #Check for output file dope.png. If it exists, ask for a new filename sub check_dope { print "Should I overwrite (y/n): "; $check = <STDIN>; chomp $check; if($check eq 'y') { exit; } elsif($check eq 'n') { print "Enter new filename : "; $dopeout=<STDIN>; chomp $dopeout; } else { print "Please enter y/n"; exit; } return $dopeout; } #Plotting the values using gnuplot sub plot_dope { my($targdat, $tempdat, $dopeout) = @_; my $gnuplot = "/usr/bin/gnuplot"; my $terminal = "set terminal png\;\n"; my $xlabel = "set xlabel \"Residue Index\"\;\n"; my $ylabel = "set ylabel \"DOPE Score\"\;\n"; my $missing = "set datafile missing \"?\"\;\n"; my $output = "set output \"$dopeout\"\;\n"; my $plot = "plot \"$targdat\" with lines, \"$tempdat\" with lines\;\n"; my @plotset = ($terminal,$xlabel,$ylabel,$missing,$output,$plot); # print "@plotset\n"; my $plotscript = "plotscript.txt"; open(PLOTSCRIPT, ">$plotscript"); print PLOTSCRIPT "@plotset"; close (PLOTSCRIPT); my $finalscript = "plotscript.txt"; # open(FINALSCRIPT, $finalscript); # # @gnuplotscript = <FINALSCRIPT>; `$gnuplot $finalscript`; print "The DOPE plot is in $dopeout\n"; }