Re: [modeller_usage] read_al_373E> Protein specified in ALIGN_CODES(i) was not found in the alignment file
Subject: Re: [modeller_usage] read_al_373E> Protein specified in ALIGN_CODES(i) was not found in the alignment file
From: Andrea Spinelli <>
Date: Sat, 13 Jan 2018 20:05:03 +0100
I tried to use Python 3, but it isn't working.
I can't understand why my Modeller module works when I use it from the
Python command prompt, but not when I call the function from a script.
The attached files contain the main file (from_fasta_to_best_coverage.py),
from which I call the module function (model.py). The file called
'down_fasta_if_not_PDB' is used in the first part of the main file, and it works.
To test it, I use the UniProt code P02489.
Please help me.
Andrea
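In short, the call chain is the following (a simplified sketch of the attached
main file, not the full code):

# from_fasta_to_best_coverage.py (sketch)
import down_fasta_if_not_PDB
import model

my_fasta = down_fasta_if_not_PDB.query_PDB()   # download P02489.fasta if needed
# ... BLAST search, template selection and PIR export happen here ...
best_pdb = model.make_model()                  # salign + align_multi + automodel
print(best_pdb)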
On 12/01/18 20:08, Téletchéa Stéphane wrote:
On 12/01/2018 at 17:34, Andrea Spinelli wrote:
The error appears only when I try to use the script function
encapsulated in a module file, calling the function from another
Python file.
Andrea
Dear Andrea,
Are you using the very same Python version in both cases (not Python 2
in one and Python 3 in the other)?
Pay attention to encoding; dashes, for instance, are sometimes
misinterpreted and not handled properly in Python 2...
Best,
Stéphane
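A quick way to verify this is to print the interpreter information in both
contexts, for example (a minimal check, not Modeller-specific):

import sys
print(sys.version)      # interpreter version
print(sys.executable)   # path of the interpreter actually running the code

Run it once at the interactive prompt and once at the top of
from_fasta_to_best_coverage.py; if the two outputs differ, the interactive
session and the script are not using the same Python.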
# down_fasta_if_not_PDB.py
import os
import wget


def query_PDB():
    """Ask for a UniProt ID and return the path of its FASTA file."""
    ID_uniprot = input('Which UniProt ID? ')
    if not os.path.exists(str(ID_uniprot) + '.fasta'):
        my_fasta = obtain_fasta(id_uniprot=ID_uniprot)
    else:
        my_fasta = str(ID_uniprot) + '.fasta'
        print("fasta found")
    return my_fasta


def obtain_fasta(id_uniprot):
    """
    Download the FASTA file if the UniProt entry has no PDB cross-reference.
    """
    # download the UniProt text entry
    url = 'http://www.uniprot.org/uniprot/' + id_uniprot + '.txt'
    prt_fasta = None  # stays None if the entry already has a PDB structure
    try:
        prt = wget.download(url, bar=None)
        prt_open = open(prt, 'r')
    except Exception as e:
        print("<p>Error: %s</p>" % e)
    # structure control: does the entry reference a PDB structure?
    if 'PDB' in prt_open.read():
        print('PDB found')
    else:
        # no PDB cross-reference: download the FASTA sequence
        url_fasta = 'http://www.uniprot.org/uniprot/' + id_uniprot + '.fasta'
        try:
            prt_fasta = wget.download(url_fasta, bar=None)
            print('fasta file downloaded')
        except Exception as e:
            print("<p>Error: %s</p>" % e)
    return str(prt_fasta)
# model.py
import os

from modeller import *
from modeller.automodel import *


def salign():
    """Illustrates the SALIGN multiple structure/sequence alignment."""
    env = environ()
    env.io.atom_files_directory = os.getcwd()
    aln = alignment(env)
    for (code, chain) in (('2klr', 'A'), ('3l1e', 'A')):
        mdl = model(env, file=code, model_segment=('FIRST:'+chain, 'LAST:'+chain))
        aln.append_model(mdl, atom_files=code, align_codes=code+chain)
    for (weights, write_fit, whole) in (((1., 0., 0., 0., 1., 0.), False, True),
                                        ((1., 0.5, 1., 1., 1., 0.), False, True),
                                        ((1., 1., 1., 1., 1., 0.), True, False)):
        aln.salign(rms_cutoff=3.5, normalize_pp_scores=False,
                   rr_file='$(LIB)/as1.sim.mat', overhang=30,
                   gap_penalties_1d=(-450, -50),
                   gap_penalties_3d=(0, 3), gap_gap_score=0, gap_residue_score=0,
                   dendrogram_file='P02489.tree',
                   alignment_type='tree',  # If 'progressive', the tree is not
                                           # computed and all structures will be
                                           # aligned sequentially to the first
                   feature_weights=weights,  # For a multiple sequence alignment only
                                             # the first feature needs to be non-zero
                   improve_alignment=True, fit=True, write_fit=write_fit,
                   write_whole_pdb=whole, output='ALIGNMENT QUALITY')
    aln.write(file='template.pap', alignment_format='PAP')
    aln.write(file='template.ali', alignment_format='PIR')
    aln.salign(rms_cutoff=1.0, normalize_pp_scores=False,
               rr_file='$(LIB)/as1.sim.mat', overhang=30,
               gap_penalties_1d=(-450, -50), gap_penalties_3d=(0, 3),
               gap_gap_score=0, gap_residue_score=0, dendrogram_file='1is3A.tree',
               alignment_type='progressive', feature_weights=[0]*6,
               improve_alignment=False, fit=False, write_fit=True,
               write_whole_pdb=False, output='QUALITY')
def align_multi():
    env = environ()
    env.libs.topology.read(file='$(LIB)/top_heav.lib')
    # Read aligned structure(s):
    aln = alignment(env)
    aln.append(file='template.ali', align_codes='all')
    aln_block = len(aln)
    print('appended the templates')
    print(aln_block)
    # Read aligned sequence(s):
    aln.append(file='P02489.ali', alignment_format='PIR', align_codes='all')
    # Structure sensitive variable gap penalty sequence-sequence alignment:
    aln.salign(output='', max_gap_length=20,
               gap_function=True,  # to use structure-dependent gap penalty
               alignment_type='PAIRWISE', align_block=aln_block,
               feature_weights=(1., 0., 0., 0., 0., 0.), overhang=0,
               gap_penalties_1d=(-450, 0),
               gap_penalties_2d=(0.35, 1.2, 0.9, 1.2, 0.6, 8.6, 1.2, 0., 0.),
               similarity_flag=True)
    aln.write(file='P02489-mult.ali', alignment_format='PIR')
    aln.write(file='P02489-mult.pap', alignment_format='PAP')
def model_multi():
    env = environ()
    a = automodel(env, alnfile='P02489-mult.ali',
                  knowns=('2klrA', '3l1eA'), sequence='P02489')
    a.starting_model = 1
    a.ending_model = 5
    a.make()
    # Get a list of all successfully built models from a.outputs
    ok_models = [x for x in a.outputs if x['failure'] is None]
    # Rank the models by molpdf score
    key = 'molpdf'
    ok_models.sort(key=lambda x: x[key])
    # Get top model
    m = ok_models[0]
    print("Top model: %s (molpdf %.3f)" % (m['name'], m[key]))
    return str(m['name'])
def make_model():
    print('running salign')
    salign()
    print('running align_multi')
    align_multi()
    print('running modelling')
    pdb_structure = model_multi()
    return pdb_structure
# from_fasta_to_best_coverage.py
import os, sys, wget
import down_fasta_if_not_PDB
from Bio import SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
from modeller import *
import model
def filter_coverage_per_score(result_handle_xml):
    """
    Filter the best-coverage alignment for each score range.
    Alignments with a score below 50 are excluded.
    """
    result = []
    best_align_len_200 = 0
    best_align_len_80_200 = 0
    best_align_len_50_80 = 0
    best_align_200 = None
    best_align_80_200 = None
    best_align_50_80 = None
    for record in NCBIXML.parse(result_handle_xml):
        if record.alignments:  # skip queries with no matches
            for align in record.alignments:
                for hps in align.hsps:
                    if hps.bits >= 200:
                        if align.length >= best_align_len_200:
                            best_align_len_200 = align.length
                            best_align_200 = align
                    elif 80 < hps.bits < 200:
                        if align.length >= best_align_len_80_200:
                            best_align_len_80_200 = align.length
                            best_align_80_200 = align
                    elif 50 < hps.bits < 80:
                        if align.length >= best_align_len_50_80:
                            best_align_len_50_80 = align.length
                            best_align_50_80 = align
    # keep the best alignment found in each score range
    if best_align_200: result.append(best_align_200)
    if best_align_80_200: result.append(best_align_80_200)
    if best_align_50_80: result.append(best_align_50_80)
    return result
def filter_align_length(best_aligns_coverage_per_score):
    result = []
    for i in range(0, len(best_aligns_coverage_per_score) - 1):
        if best_aligns_coverage_per_score[i].length * 1.2 <= best_aligns_coverage_per_score[i + 1].length:
            result.append(best_aligns_coverage_per_score[i])
            result.append(best_aligns_coverage_per_score[i + 1])
            break  # two alignments are enough
        else:
            result.append(best_aligns_coverage_per_score[i])
    return result
def print_align(best_align_score_length):
    for align in best_align_score_length:
        for hps in align.hsps:
            # coverage = hsp.align_length / query.query_length
            coverage = round(float(len(hps.sbjct)) / float(query_seq_len) * 100, 1)
            print('-----------------------')
            print('title:')
            t = str(align.title).split('>')
            chain, id_chain, title_id = reshape_title(t[0])
            print(chain)
            print('length: %s' % align.length)
            print('coverage: %s' % coverage)
            print('score: %s' % hps.bits)
            print('query: %s' % hps.query)
            print('match: %s' % hps.match)
            print('sbjct: %s' % hps.sbjct)
    # return the title parts of the last alignment printed
    return chain, id_chain, title_id
def reshape_title(title):
    """Reshape the title of a FASTA/BLAST hit."""
    title_fasta = str(title).split('>')
    title_chain = []
    title_id_chain = []
    title_id = []
    for t in title_fasta:
        title_chain_el = t.split('|')
        title_id_s = title_chain_el[3]
        title_id_name = title_id_s + '_' + title_chain_el[4][0]
        title_chunk_start = '|'.join(title_chain_el[i] for i in range(0, 2))
        title_description = title_chain_el[4][1:]
        title_chunk = title_chunk_start + '|' + title_id_name + '|' + title_description
        title_chain.append(title_chunk)
        title_id_chain.append(title_id_name)
        title_id.append(title_id_s)
    return title_chain, title_id_chain, title_id
def pir_export(fasta_object):
    fasta_ID = str(fasta_object.id).split('|')[1]
    pir_file = open(fasta_ID + '.ali', mode='w')
    seq = fasta_object.seq
    pir_file.write('>P1;' + fasta_ID + '\n')
    pir_file.write('sequence:' + fasta_ID + ':::::::0.00: 0.00 \n')
    pir_file.write(str(seq) + '*')
    pir_file.close()
    print('ALI file created')
    return pir_file
def download_PDB(best_align_score_length):
    name_pdb = []
    print('PDB downloading')
    for align in best_align_score_length:
        t = str(align.title).split('>')[0]
        title_chain, title_id_chain, title_id = reshape_title(t)
        url = 'https://files.rcsb.org/download/' + str(title_id[0]) + '.pdb'
        prt = wget.download(url, bar=None)
        name_pdb.append(prt)  # remember the downloaded PDB file name
    return name_pdb
################
# -- script -- #
################

""" searching for the best template with BLAST """
# download the FASTA file for the UniProt ID if no PDB entry exists
my_fasta = down_fasta_if_not_PDB.query_PDB()
# set the query sequence from the FASTA file
my_query = SeqIO.read(my_fasta, format='fasta')
query_seq_len = len(my_query.seq)
# run BLAST and parse the result as XML
try:
    print('BLAST interrogation')
    result_handle_xml = NCBIWWW.qblast("blastp", "pdb", my_query.seq, format_type='XML')
except Exception as e:
    print("<p>Error: %s</p>" % e)
# find the longest alignments for each score range
best_aligns_coverage_per_score = filter_coverage_per_score(result_handle_xml)
if best_aligns_coverage_per_score == []:
    print("matched sequences have too low a score")
# find the best alignments
best_align_score_length = filter_align_length(best_aligns_coverage_per_score)
# print the results
title_chain, title_id_chain, title_id = print_align(best_align_score_length)

""" saving phase """
# download the template PDB structures
PDB_template = download_PDB(best_align_score_length)
# export the query sequence in PIR format
query_seq_pir = pir_export(my_query)
print(query_seq_pir)

""" Modeller """
a = model.make_model()
print(a)