[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [modeller_usage] read_al_373E> Protein specified in ALIGN_CODES(i) was not found in the alignment file



I try to use python3, but it doesn't working.

I can't understand why if I use my modeller module file in python command prompt, it works. If I use the function in a script, not.

In attached file there is the main file (from_fasta_to_best_coverage.py) from which I call the module function (model.py). The file called 'down_fasta_if_not_PDB' is used in the first part of main file and It works

To test it, I use the uniprot code P02489.

Please, help me

Andrea


On 12/01/18 20:08, Téletchéa Stéphane wrote:
Le 12/01/2018 à 17:34, Andrea Spinelli a écrit :
The error appears only when I try to use the script function encapsulated in a module file, calling it the function from another python file.

Andrea

Dear Andrea,

The very same python version (not 2 in one case and 3 in the other ?). Pay attention to encoding, dashes for instance are sometimes misinterpreted and not handled properly in python2...

Best,

Stéphane


import wget
import os

def query_PDB():
    ID_uniprot = raw_input('Which uniprot ID? ')
    if not os.path.exists(str(ID_uniprot)+'.fasta'):
        my_fasta = obtain_fasta(id_uniprot=ID_uniprot)
    else:
        my_fasta = str(ID_uniprot)+'.fasta'
        print "fasta found"
    return my_fasta


def obtain_fasta(id_uniprot):
    """
    Download the fasta file if there is no Uniprot ID PDB
    """

    # import uniprot txt protein
    url = 'http://www.uniprot.org/uniprot/'+id_uniprot+'.txt'
    try:
        prt = wget.download(url,bar = None)
        prt_open = open(prt,'r')
    except:
        e = sys.exc_info()[0]
        write_to_page( "<p>Error: %s</p>" % e )


    # structure control
    if 'PDB' in prt_open.read():
        print 'PDB found'
    else:
        # download fasta
        url_fasta = 'http://www.uniprot.org/uniprot/'+id_uniprot+'.fasta'
        try:
            prt_fasta = wget.download(url_fasta,bar = None)
            print 'fasta file downloaded'
        except:
            e = sys.exc_info()[0]
            write_to_page( "<p>Error: %s</p>" % e )


    return str(prt_fasta)
from modeller import *
from modeller.automodel import *
import os

def salign():


    """ Illustrates the SALIGN multiple structure/sequence alignment """

    env = environ()
    env.io.atom_files_directory = os.getcwd()

    aln = alignment(env)
    for (code, chain) in (('2klr', 'A'), ('3l1e', 'A')):
        mdl = model(env, file=code, model_segment=('FIRST:'+chain, 'LAST:'+chain))
        aln.append_model(mdl, atom_files=code, align_codes=code+chain)

    for (weights, write_fit, whole) in (((1., 0., 0., 0., 1., 0.), False, True),
                                        ((1., 0.5, 1., 1., 1., 0.), False, True),
                                        ((1., 1., 1., 1., 1., 0.), True, False)):
        aln.salign(rms_cutoff=3.5, normalize_pp_scores=False,
                    rr_file='$(LIB)/as1.sim.mat', overhang=30,
                    gap_penalties_1d=(-450, -50),
                    gap_penalties_3d=(0, 3), gap_gap_score=0, gap_residue_score=0,
                    dendrogram_file='P02489.tree',
                    alignment_type='tree', # If 'progresive', the tree is not
                                      # computed and all structues will be
                                      # aligned sequentially to the first
                    feature_weights=weights, # For a multiple sequence alignment only
                                        # the first feature needs to be non-zero
                    improve_alignment=True, fit=True, write_fit=write_fit,
                    write_whole_pdb=whole, output='ALIGNMENT QUALITY')

    aln.write(file='template.pap', alignment_format='PAP')
    aln.write(file='template.ali', alignment_format='PIR')

    aln.salign(rms_cutoff=1.0, normalize_pp_scores=False,
            rr_file='$(LIB)/as1.sim.mat', overhang=30,
            gap_penalties_1d=(-450, -50), gap_penalties_3d=(0, 3),
            gap_gap_score=0, gap_residue_score=0, dendrogram_file='1is3A.tree',
            alignment_type='progressive', feature_weights=[0]*6,
            improve_alignment=False, fit=False, write_fit=True,
            write_whole_pdb=False, output='QUALITY')


def align_multi():
    env = environ()
    env.libs.topology.read(file='$(LIB)/top_heav.lib')

    # Read aligned structure(s):
    aln = alignment(env)
    aln.append(file='template.ali', align_codes='all')
    aln_block = len(aln)


    print 'ho fatto append dei template'
    print aln_block
    # Read aligned sequence(s):
    aln.append(file='P02489.ali', alignment_format='PIR', align_codes='all')

    # Structure sensitive variable gap penalty sequence-sequence alignment:
    aln.salign(output='', max_gap_length=20,
               gap_function=True,   # to use structure-dependent gap penalty
               alignment_type='PAIRWISE', align_block=aln_block,
               feature_weights=(1., 0., 0., 0., 0., 0.), overhang=0,
               gap_penalties_1d=(-450, 0),
               gap_penalties_2d=(0.35, 1.2, 0.9, 1.2, 0.6, 8.6, 1.2, 0., 0.),
               similarity_flag=True)

    aln.write(file='P02489-mult.ali', alignment_format='PIR')
    aln.write(file='P02489-mult.pap', alignment_format='PAP')


def model_multi():
    env = environ()
    a = automodel(env, alnfile='P02489-mult.ali',
                  knowns=('2klrA','3l1eA'), sequence='P02489')
    a.starting_model = 1
    a.ending_model = 5
    a.make()

    # Get a list of all successfully built models from a.outputs
    ok_models = filter(lambda x: x['failure'] is None, a.outputs)

    # Rank the models by DOPE score
    key = 'molpdf'
    ok_models.sort(lambda a,b: cmp(a[key], b[key]))

    # Get top model
    m = ok_models[0]
    print "Top model: %s (molpdf %.3f)" % (m['name'], m[key])
    return str(m['name'])



def make_model():
    print 'eseguita salign'
    salign()
    print 'eseguita align'
    align_multi()
    print 'medellizzazione'
    pdb_structure = model_multi()
    return pdb_structure
import os, sys, wget
import down_fasta_if_not_PDB
from Bio import SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
from modeller import *
import model


def filter_coverage_per_score(result_handle_xml):
    """
    Filter the best coverage aglinment for each score range.
    Are excluded the aglinmensts with score less than 50.
    """

    result  = []
    best_align_len_200 = 0
    best_align_len_80_200 = 0
    best_align_len_50_80 = 0

    best_align_200 = None
    best_align_80_200 = None
    best_align_50_80 = None

    for record in NCBIXML.parse(result_handle_xml):
        if record.alignments:       #skip queries with no matches
            for align in record.alignments:
                for hps in align.hsps:
                    if hps.bits >= 200:
                        if align.length >= best_align_len_200:
                            best_align_len_200 = align.length
                            best_align_200 = align

                    elif hps.bits < 200 and hps.bits > 80:
                        if align.length >= best_align_len_80_200:
                            best_align_len_80_200 = align.length
                            best_align_80_200 = align

                    elif hps.bits < 80 and hps.bits > 50:
                        if align.length >= best_align_len_50_80:
                            best_align_len_50_80 = align.length
                            best_align_50_80 = align


    if best_align_200: result.append(best_align_200)
    if best_align_80_200: result.append(best_align_80_200)
    if best_align_50_80: result.append(best_align_50_80)

    return result

def filter_align_length(best_aligns_coverage_per_score):
    result = []
    for i in range(0,len(best_aligns_coverage_per_score)-1):
        if  best_aligns_coverage_per_score[i].length*(1.2) <= best_aligns_coverage_per_score[i+1].length:
            result.append(best_aligns_coverage_per_score[i])
            result.append(best_aligns_coverage_per_score[i+1])
            break   # two aglinments are enough
        else:
            result.append(best_aligns_coverage_per_score[i])
    return result

def print_align(best_align_score_length):
    for align in best_align_score_length:
        for hps in align.hsps:
            # hsp.align_length / query.query_length
            coverage = round(float(len(hps.sbjct))/float(query_seq_len)*100,1)
            print '-----------------------'
            print 'title:'
            t = str(align.title).split('>')
            chain, id_chain, title_id = reshape_title(t[0])
            print chain
            print 'length: %s' %align.length
            print 'coverage: %s' %coverage
            print 'score: %s' %hps.bits
            print 'query: %s' %hps.query
            print 'match: %s' %hps.match
            print 'sbjct: %s' %hps.sbjct
    return chain, id_chain, title_id



def reshape_title(title):
    """ reshape title of fasta"""
    title_fasta = str(title).split('>')

    title_chain = []
    title_id_chain = []
    title_id = []

    for t in title_fasta:
        title_chain_el = t.split('|')

        title_id_s = title_chain_el[3]
        title_id_name = title_id_s+'_'+title_chain_el[4][0]

        title_chunk_start = '|'.join(title_chain_el[t] for t in range(0,2))
        title_description = title_chain_el[4][1:]
        title_chunk = title_chunk_start + '|' + title_id_name + '|' + title_description

        title_chain.append(title_chunk)
        title_id_chain.append(title_id_name)
        title_id.append(title_id_s)

    return title_chain, title_id_chain, title_id


def pir_export(fasta_object):
    fasta_ID = str(fasta_object.id).split('|')[1]
    pir_file = open(fasta_ID+'.ali', mode='w')
    seq = fasta_object.seq
    pir_file.write('>P1;'+fasta_ID+'\n')
    pir_file.write('sequence:'+fasta_ID+':::::::0.00: 0.00 \n')
    pir_file.write(str(seq)+'*')
    pir_file.close
    print 'ALI file created'
    return pir_file


def download_PDB(best_align_score_length):
    name_pdb = []
    print 'PDB downloading'
    for align in best_align_score_length:
        t = str(align.title).split('>')[0]
        title_chain, title_id_chain, title_id = reshape_title(t)

        url = 'https://files.rcsb.org/download/'+str(title_id[0])+'.pdb'
        prt = wget.download(url,bar = None)
        prt_open = open(prt,'r')
    return name_pdb

################
# -- script -- #
################

""" searcing of best template of BLAST"""

# download fasta file of Uniprot ID if PDB does not exists
my_fasta = down_fasta_if_not_PDB.query_PDB()

# set query sequence from fasta
my_query = SeqIO.read(my_fasta,format = 'fasta')
query_seq_len = len(my_query.seq)

# parse the result via xml result
try:
    print 'Blast interrogation'
    result_handle_xml = NCBIWWW.qblast("blastp","pdb",my_query.seq,format_type='XML')
except:
    e = sys.exc_info()[0]
    print( "<p>Error: %s</p>" % e )


# find the longest alignments for each score range
best_aligns_coverage_per_score = filter_coverage_per_score(result_handle_xml)
if best_aligns_coverage_per_score == []:
    print "matched sequences has a too low score"

# find the best alignments
best_align_score_length = filter_align_length(best_aligns_coverage_per_score)

# print the results
title_chain, title_id_chain, title_id = print_align(best_align_score_length)

""" saving phase """
# download the aligned fasta sequences
PDB_template = download_PDB(best_align_score_length)

# export the PIR query sequence
query_seq_pir = pir_export(my_query)
print query_seq_pir

""" Modeller """
a = model.make_model()
print a