#!/usr/bin/python
# This file is part of ModPipe, Copyright 1997-2010 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

"""Collate results of StrucImpact.pl model builds."""

from optparse import OptionParser
import glob, os, os.path, re, sys

import modpipe.version
import modpipe.serialize
import modpipe.pdbutils
import modeller

# Turn on/off residue-by-residue calculations.  When True, main() also keeps,
# for every target residue, the best score seen among templates other than the
# template of interest, and prints the "Target residues..." summary section.

residue_by_residue_f = True

# ------------------------------------------------------------------------------
def main():
    global pdbrep, chain_length_file, chain_length_table, new_chains, best_dir, \
           i_scores

    # Parse command line options.

    parser = OptionParser(version=modpipe.version.message())

    parser.set_usage("""
 Parse and collate results from StrucImpact.pl model builds.
 Output to standard out.

   %prog [options] """)

    # Set defaults.

    parser.set_defaults(rundir='',
                     chain_length_file='',
                     exclude_hits_mode='0000')

    # Populate options list.

    parser.add_option("-r", "--run_directory",
                 dest="rundir",
                 type='string',
                 help="""Run directory - contains subdirectory "data" of
                 model results (required)""",
                 metavar="DIR")
    pdb = modpipe.pdbutils.get_pdb_repository()
    parser.add_option("-p", "--pdb_repository",
                 dest="pdbrep",
                 help="""Directories containing PDB files (default: """ \
                      + pdb + ')', default=pdb,
                 metavar="DIR")
    parser.add_option("-c", "--chain_length_file",
                 dest="chain_length_file",
                 help="""File containing lengths of PDB chains (required - new
                 data will be added to the file; initialize with empty file).""",
                 metavar="FILE")
    parser.add_option("-x", "--exclude",
                 dest="exclude_hits_mode",
                 help="""Template hits by method that will not be included
                 in results.  Four-letter code containing 1 (exclude)
                 or 0 (keep) for each of Seq-Seq, Prf-Seq, Prf-Prf, and
                 Seq-Prf, respectively.  For example, to exclude
                 Prf-Prf: 0010.  (default: 0000)""",
                 metavar="HITS_MODE")

    # Check arguments.

    opts, args = parser.parse_args()

    if not opts.rundir:
        parser.print_help()
        sys.exit(1)

    if not opts.chain_length_file:
        parser.print_help()
        sys.exit(1)

    # --exclude_hits_mode must be four characters, 0s or 1s.

    if not re.search("^[01]{4}$", opts.exclude_hits_mode):
        parser.error( \
                    "--exclude_hits_mode must be four characters, 0s and 1s only")
        sys.exit

    # Check that files/directories exist.

    rundir = opts.rundir
    if not os.path.isdir(rundir):
        parser.error("Could not find run directory " + rundir)

    pdbrep = opts.pdbrep

    chain_length_file = opts.chain_length_file
    if not os.access(chain_length_file, os.R_OK | os.W_OK):
        parser.error("Do not have read-write access to file " \
                      + chain_length_file)

    # Show parameter settings.

    #      ----+----1----+---
    print "Run directory:    ", rundir
    if opts.exclude_hits_mode != "0000":
        #      ----+----1----+---
        print "Exclude hits mode:", opts.exclude_hits_mode
        print

    # ---------------------------------------------------------------------------
    # Read chain lengths.

    new_chains = 0
    chain_lengths = open(chain_length_file, "r").readlines()

    # Save in hash.

    chain_length_table = {}
    for chain_length in chain_lengths:
        chain_length = chain_length.strip()
        fields = chain_length.split()
        pdbid_chain = fields[0]
        length = int(fields[1])
        chain_length_table[pdbid_chain] = length


    # ---------------------------------------------------------------------------
    # Fields and heading.

    i_fields = [8, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 28]
    #            ----+----1----
    heading = [' Align e-value',   #  0
               '   MOD obj fct',   #  1
               '          DOPE',   #  2
               '       DOPE-HR',   #  3
               ' GA341 mod ass',   #  4
               ' GA341 compact',   #  5
               ' GA341 pairwis',   #  6
               ' GA341 surface',   #  7
               ' GA341 combind',   #  8
               ' GA341 pair. Z',   #  9
               ' GA341 surf. Z',   # 10
               ' GA341 comb. Z',   # 11
               ' Normalzd DOPE',   # 12
               ' Seq. identity',   # 13
               ' ModPipe qual.']   # 14

    # "Extra" field.

    heading.append(' Frac template')   # 15

    n_fields = len(i_fields)

    # Indicator - 1: "best" is larger (e.g., sequence identity, Modeller quality
    #                score)
    #            -1: "best" is smaller (e.g., e-value, DOPE)

    #             0   1    2    3    4    5    6    7    8    9   10
    best_dir = [-1., -1., -1., -1.,  1.,  1., -1., -1., -1., -1., -1.,
                -1., -1. , 1.,  1.]

    # For extra field.

    best_dir.append(1.)

    # Which results/scores to tally.

    i_scores = [13, 14]
    n_scores = len(i_scores)

    # Identify the new template - PDB ID and chain, and MD5 hash.

    pattern4 = "%s/????.unq" % rundir
    pattern5 = "%s/?????.unq" % rundir
    unq_file = glob.glob(pattern4) + glob.glob(pattern5 )
    if not unq_file:
        print "Could not find new structure .unq file in", rundir
        sys.exit(1)

    if len(unq_file) > 1:
        print "More than one .unq file in", rundir
        sys.exit(1)

    # Read file.  Line looks like:
    #               d72f5d7656e4bd5d479e8ffe5915c09fDMLPEEAP :  3e9vA

    unq_file = unq_file[0]
    line = open(unq_file, "r").readline()
    fields = line.split()
    template_md5 = fields[0]
    template_pdbid_chain = fields[2]
    ###print template_md5, template_pdbid_chain


    # Create list of .mod files -- one for each target (where target is a full-
    # UniProt match to the template of interest).  Go through full-UniProt
    # profile to screen out target identical to template-of-interest (in case not
    # screened already).

    modfiles = create_modfile_list( rundir, template_md5 )
    n_modfiles = len(modfiles)
    sys.stderr.write("n_modfiles: %s\n" % n_modfiles)

    # Loop over .mod files / targets.

    n_template_better = []
    sum_template_better = []
    n_template_only = 0
    for i in i_scores:
        n_template_better.append(0)
        sum_template_better.append(0.0)

    n_fraction_template_used_gt_50pct = 0
    n_acceptable_ga341_e_value = 0
    n_acceptable_ga341 = 0
    n_acceptable_e_value = 0
    n_not_acceptable = 0
    n_targets = 0

    # If flag set, counts across targets of residues modeled best by template-of-
    # interest vs. other templates.

    if residue_by_residue_f:
        sum_res_template_best = []
        sum_res_template_best_sq = []
        n_res_template_best = []

        sum_res_non_template_best = []
        sum_res_non_template_best_sq = []
        n_res_non_template_best = []

        n_res_not_modeled = 0
        for i in range(n_scores):
            sum_res_template_best.append(0.0)
            sum_res_template_best_sq.append(0.0)
            n_res_template_best.append(0)

            sum_res_non_template_best.append(0.0)
            sum_res_non_template_best_sq.append(0.0)
            n_res_non_template_best.append(0)

    # Two histograms of maximum sequence identity each target -- with and
    # without prf-seq hits, if available.

    seq_identity_cats \
                 = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 200.0]
    hists = []
    hists.append(  [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
    hists.append(  [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

    n_template_greater_seq_identity = 0

    #      ----+----1----+-
    print "                ",
    for i in i_scores:
        print "%14s" % heading[i],

    print

    for modfile in modfiles:
        modfile_bname = os.path.basename(modfile)
        sys.stderr.write(modfile_bname + "\n")

        # Read list of model objects.

        fp = open(modfile, "r")
        models = modpipe.serialize.read_models_file(fp)

        # Min, max, average for everything.

        best_results = []
        template_results = []
        sum_results = []
        for i in i_scores:
            best_init = -1.0e6 * best_dir[i]
            best_results.append(best_init)
            template_results.append(best_init)
            sum_results.append(0.0)

        best_template_seq_identity = -1.0e6
        template_e_value = 1.0e6
        template_ga341 = -1.0e6
        fraction_template_used = -1.0e6

        max_seq_identity_pct = -1.0e6
        max_seq_identity_pct_x0100 = -1.0e6

        residue_best = []

        n_models = 0

        for model in models:

            # Skip templates that are essentially identical to the current target.

            seq_identity_pct = float(model.highest_sequence_identity)
            if seq_identity_pct >= 99.0:
                sys.stderr.write("Skipping " + os.path.basename(modfile) + " " \
                                  + model.templates[0].code \
                                  +  model.templates[0].chain \
                                  + " seq_identity_pct " + `seq_identity_pct` \
                                  + "\n")
                continue

            # Do not include templates found by excluded hits_modes.

            fold_assignment_method = model.fold_assignment_method
            if exclude(fold_assignment_method, opts.exclude_hits_mode):
                sys.stderr.write("Excluding: %s\n" % modfile_bname)
                continue

            # Do not include template of interest in statistics. (It's shown
            # separately).
            # Identify template of interest.  Field 30 contains PDB ID and chain,
            # beginning, end, and seqid, blank separated.

            pdbid = model.templates[0].code
            chain = model.templates[0].chain
            current_pdbid_chain = pdbid + chain

            # Want the fraction of the PDB template used.  Get the length of the
            # chain from the PDB file.  See if available already in database
            # table.

            current_template_beg = model.templates[0].region[0]
            current_template_end = model.templates[0].region[1]
            current_template_used = current_template_end - current_template_beg + 1
            current_template_len = get_chain_length(current_pdbid_chain)

            fraction_current_template_used = float(current_template_used) \
                                             / float (current_template_len)

            # Read results into list -- will choose subset based on i_scores.

            result_set = get_result_set(model)
            if current_pdbid_chain == template_pdbid_chain:

                # Keep template beg, end positions in target from template
                # with highest sequence identity (in case duplicates from
                # different hit modes).

                this_template_seq_identity = float(model.highest_sequence_identity)
                if this_template_seq_identity > best_template_seq_identity:
                    best_template_seq_identity = this_template_seq_identity
                    template_beg = model.region[0]
                    template_end = model.region[1]

                # Best E-value and GA341.

                this_template_e_value = float(model.alignment.evalue)
                if this_template_e_value < template_e_value:
                    template_e_value = this_template_e_value

                this_template_ga341 = float(model.score.ga341.total)
                if this_template_ga341 > template_ga341:
                    template_ga341 = this_template_ga341

                # Also, highest fraction template used.

                if fraction_current_template_used > fraction_template_used:
                    fraction_template_used = fraction_current_template_used

                # Save best of each score among templates-of-interest

                for i, i_score in enumerate(i_scores):
                    result_i = result_set[i_score]

                    best_dir_i = best_dir[i_score]
                    if best_dir_i * result_i > best_dir_i * template_results[i]:
                        template_results[i] = result_i

            else:

                # Not template of interest.  Add on to statistics.  Do for selected
                # subset of results.

                results = []
                for i_score in i_scores:
                    result_i = result_set[i_score]
                    results.append(result_i)


                for i, i_score in enumerate(i_scores):
                    best_dir_i = best_dir[i_score]
                    if best_dir_i * results[i] > best_dir_i * best_results[i]:
                        best_results[i] = results[i]

                    sum_results[i] += results[i]

                # If flag set, also keep a target_residue-by-target_residue
                # record of which template has best scores (since different
                # templates cover different portions of the target).

                if residue_by_residue_f:
                    if not residue_best:
                        target_length = model.sequence.length
                        for i in range(target_length):
                            residue_best.append( [] )

                    target_beg = model.region[0]
                    target_end = model.region[1]
                    residue_tally(residue_best, results, target_beg, target_end,
                                  current_pdbid_chain)

                n_models += 1

            # Keep record of top percent sequence identity across all templates,
            # both including and excluding prf-seq hits.

            if seq_identity_pct > max_seq_identity_pct:
                max_seq_identity_pct = seq_identity_pct

            if fold_assignment_method[1] == "0":
                if seq_identity_pct > max_seq_identity_pct_x0100:
                    max_seq_identity_pct_x0100 = seq_identity_pct


        if best_template_seq_identity == -1.0e6:
            msg = "Did not find %s in %s\n" % (template_pdbid_chain, modfile_bname)
            sys.stderr.write(msg)
            continue

        # Histograms of top sequence identity, with and without prf-seq hits.

        hist_tally(hists[0], seq_identity_cats, max_seq_identity_pct)
        hist_tally(hists[1], seq_identity_cats, max_seq_identity_pct_x0100)
        n_targets += 1

        if n_models == 0:

            # Template of interest used as a template; nothing else found to use as
            # a template.  Note so, and keep separate total.

            print os.path.basename(modfile)
            print "%s is only available template" % template_pdbid_chain
            n_template_only += 1
            continue

        # Calculate averages.

        rn_models = float(n_models)
        avg_results = []
        for i in range(n_scores):
            avg_results.append(sum_results[i]/rn_models)

        # If flag set, count number of residues where template of interest is best
        # for each score, number of residues where other template is best, and
        # remaining (unmodeled) residues.

        if residue_by_residue_f:
            for i_residue in range(target_length):
                ok_f = False
                for i, i_score in enumerate(i_scores):
                    if i_residue >= template_beg-1 and i_residue < template_end:

                        # In the range of template-of-interest aligned residues.
                        # Count where template-of-interest is best.  First see if
                        # there is a non-template-of-interest template residue
                        # available for this target residue.

                        best_dir_i = best_dir[i_score]
                        if residue_best[i_residue]:
                            ok_f = template_results[i] * best_dir_i \
                                             >= residue_best[i_residue][i] * best_dir_i
                        else:

                            # No non-template of interest residue available.

                            ok_f = True
                    else:

                        # Outside of range of template-of-interest aligned residues.
                        # Count as non-template-of-interest better as long as such a
                        # residue available.

                        ok_f = False

                    if ok_f:
                        sum_res_template_best[i] += template_results[i]
                        sum_res_template_best_sq[i] \
                                            += template_results[i]*template_results[i]
                        n_res_template_best[i] += 1

                        # DKTEMP
                        if residue_best[i_residue] :
                            sum_res_non_template_best[i] += residue_best[i_residue][i]
                            sum_res_non_template_best_sq[i] \
                               += residue_best[i_residue][i]*residue_best[i_residue][i]
                            n_res_non_template_best[i] += 1
                        else :
                            if i == 0:
                                n_res_not_modeled += 1
                    else:
                        # DKTEMP
                        pass
                        #if residue_best[i_residue] :
                        #   sum_res_non_template_best[i] += residue_best[i_residue][i]
                        #   sum_res_non_template_best_sq[i] \
                        #      += residue_best[i_residue][i]*residue_best[i_residue][i]
                        #   n_res_non_template_best[i] += 1
                        #else :
                        #   if i == 0:
                        #      n_res_not_modeled += 1



        # Print results for this target.

        print os.path.basename(modfile)

        if n_models:
            print "Average         ",
            for i in range(n_scores):
                print "%14.3f" % avg_results[i],

            print
            #      ----+----1----+-
            print "Best            ",
            for i in range(n_scores):
                print "%14.3f" % best_results[i],

            print

        #      ----+----1----+-
        print "%5s           " % template_pdbid_chain,
        for i in range(n_scores):
            print "%14.3f" % template_results[i],

        print
        print

        #if residue_by_residue_f:
        #   #      ----+----1----+-
        #   print "Nres %5s best " % template_pdbid_chain,
        #   for i in range(n_scores):
        #      print "%14d" % n_res_template_best[i],
        #   print
        #   template_aligned_length = template_end - template_beg + 1
        #   print "(alignment: %d/%d target residues)" \
        #         % (template_aligned_length, target_length)
        #
        #print

        # Add to summary across targets -- # where template of interest is
        # lower/greater than others.  Calculate by how much.

        if n_models:
            for i, i_score in enumerate(i_scores):
                best_dir_i = best_dir[i_score]
                if best_dir_i * template_results[i] >= best_dir_i * best_results[i]:
                    n_template_better[i] += 1
                    sum_template_better[i] \
                              += best_dir_i*( template_results[i] - best_results[i] )

        # Keep counts of sequences modeled with template of interest.

        if fraction_template_used > 0.5:
            n_fraction_template_used_gt_50pct += 1
            if template_ga341 > 0.7 and template_e_value < 0.0001:
                n_acceptable_ga341_e_value += 1
            elif template_ga341 > 0.7:
                n_acceptable_ga341 += 1
            elif template_e_value < 0.0001:
                n_acceptable_e_value += 1
            else:
                n_not_acceptable += 1


    # Do a final save of chain_lengths (if any new).

    if new_chains != 0:
        save_chain_lengths()

    # Print summary across targets.

    print "Out of", n_targets, "targets, number where..."
    print "                ",
    for i in i_scores:
        print "%14s" % heading[i],

    print
    print "%5s is best   " % template_pdbid_chain,
    for i in range(n_scores):
        print "%14d" % (n_template_better[i] + n_template_only),

    print
    print "Better by (avg) ",
    for i in range(n_scores):
        if n_template_better[i] != 0:
            avg = sum_template_better[i] / float(n_template_better[i])
        else:
            avg = 0.0

        print "%14.3f" % avg,

    print

    if residue_by_residue_f:
        print
        print "Target residues..."
        print "Best by %s" % template_pdbid_chain
        print "   N            ",
        for i in range(n_scores):
            print "%14d" % n_res_template_best[i],

        print

        print "   Mean         ",
        for i in range(n_scores):
            avg = 0.0
            if n_res_template_best[i] != 0:
                avg = sum_res_template_best[i] / float(n_res_template_best[i]),

            print "%14.3f" % avg,

        print

        print "   Std dev      ",
        for i in range(n_scores):
            n = float(n_res_template_best[i])
            std_dev = 0.0
            if n != 0:
                x_bar = sum_res_template_best[i] / n
                est_var = (sum_res_template_best_sq[i] - n*x_bar*x_bar)/(n - 1.0)
                std_dev = est_var**0.5

            print "%14.3f" % std_dev,

        print

        print "Same residues, not best by %s" % template_pdbid_chain
        print "   N            ",
        for i in range(n_scores):
            print "%14d" % n_res_non_template_best[i],

        print

        print "   Mean         ",
        for i in range(n_scores):
            avg = 0.0
            if n_res_non_template_best[i] != 0:
                avg = sum_res_non_template_best[i] \
                                                 / float(n_res_non_template_best[i])

            print "%14.3f" % avg,

        print

        print "   Std dev      ",
        for i in range(n_scores):
            n = float(n_res_non_template_best[i])
            std_dev = 0.0
            if n != 0:
                x_bar = sum_res_non_template_best[i] / n
                est_var = (sum_res_non_template_best_sq[i] - n*x_bar*x_bar)/(n - 1.0)
                std_dev = est_var**0.5

            print "%14.3f" % std_dev,

        print

        print "Same, not modeld",
        for i in range(n_scores):
            print "%14d" % n_res_not_modeled,

        print

    print
    print "Of %s targets modeled using template %s..." \
           % (n_targets, template_pdbid_chain)
    print "Number using > 50%% of template %5s      %5d" \
          % (template_pdbid_chain, n_fraction_template_used_gt_50pct)
    print "Of those models..."
    print "   Number acceptable by GA341 and e-value %5d" \
          % n_acceptable_ga341_e_value
    print "   Number acceptable only by GA341        %5d" \
          % n_acceptable_ga341
    print "   Number acceptable only by e-value      %5d" \
          % n_acceptable_e_value
    print "   Number not acceptable                  %5d" \
          % n_not_acceptable

    print
    print
    #      ----+-
    #            ----+-
    print "Number of targets where maximum sequence identity among templates is..."
    print "                           ",
    print "  <10%  <20%  <30%  <40%  <50%  <60%  <70%  <80%  <90% <=100% Total"
    print "All templates              ",
    for hist_cat in hists[0]:
        print "%5d" % hist_cat,

    print "%7d" % sum(hists[0])

    print "Excluding prf-seq templates",
    for hist_cat in hists[1]:
        print "%5d" % hist_cat,

    print "%7d" % sum(hists[1])


    # Indicator of best sequence identity (template of interest or not) by
    # residue.

    """
    print
    print "Target length:", target_length
    print "%s models target residues %s-%s" \
           % (template_pdbid_chain, template_beg, template_end)

    template_sequence_identity = template_results[13]
    print template_pdbid_chain, "sequence identity:", template_sequence_identity
    n_res_template_best = 0
    for i in range(template_beg-1, template_end):
       if template_sequence_identity > residue_best[i].score:
          n_res_template_best += 1

    print "No. of residues where", template_pdbid_chain, " has best sequence " \
          + "identity:", n_res_template_best

    for i in range(target_length):
       print i, residue_best[i].score, residue_best[i].pdbid, \
             residue_best[i].modeled_length

    if n_res_template_best :
       print "Positions (t=template best, -=other best, ?=not modeled):"
       for i in range( 10, target_length, 10 ):
          sys.stdout.write( "%10d" % i )

       print

       for i in range(target_length):
          score_i = residue_best[i].score
          if i >= template_beg-1 and i < template_end \
             and template_sequence_identity > score_i:
             sys.stdout.write("t")
          elif score_i == 0.0:
             sys.stdout.write("?")
          else:
             sys.stdout.write("-")

       print
    """


# ------------------------------------------------------------------------------
class Residue:
    """Per-residue record: a score, a PDB ID and a modeled length.

    Every instance starts with score 0, an empty pdbid and a modeled
    length of 0.
    """

    def __init__(self):
        self.score, self.pdbid, self.modeled_length = 0, '', 0


# ------------------------------------------------------------------------------
def get_result_set(model):
    """Return the scores of one model as a flat list of 15 floats.

    The position of each value is given by the trailing index comments;
    callers pick values out of the list by position.
    """
    # NOTE(review): slots 3 and 4 both hold dope_hr and no GA341 compactness
    # value is present, while the column headings in main() label slot 4
    # "GA341 mod ass" and slot 5 "GA341 compact".  Looks like an off-by-one
    # in slots 4-5 -- confirm against the model score attributes before
    # selecting either slot for tallying.

    values = [float(model.alignment.evalue)]                #  0

    score = model.score
    values.extend([
        float(score.objfunc),                               #  1
        float(score.dope),                                  #  2
        float(score.dope_hr),                               #  3
        float(score.dope_hr),                               #  4
    ])

    ga341 = score.ga341
    values.extend([
        float(ga341.total),                                 #  5
        float(ga341.distance),                              #  6
        float(ga341.surface_area),                          #  7
        float(ga341.combined),                              #  8
        float(ga341.z_distance),                            #  9
        float(ga341.z_surface_area),                        # 10
        float(ga341.z_combined),                            # 11
    ])

    values.append(float(score.normalized_dope))             # 12
    values.append(float(model.highest_sequence_identity))   # 13
    values.append(float(score.quality))                     # 14

    return values


# ------------------------------------------------------------------------------
def create_modfile_list( rundir, template_md5 ):

    """Create list of .mod files -- one for each target (where target is a full-
    UniProt match to the template of interest).  Go through full-UniProt
    profile to screen out target identical to template-of-interest (in case not
    screened already).
    """

    prefix = template_md5[0:3]
    uniprot_profile_file = "%s/data/%s/%s/sequence/%s-uniprot.prf" \
                           % (rundir, prefix, template_md5, template_md5)
    screened_ids = get_ids_from_prf(uniprot_profile_file)
    n_screened_ids = len(screened_ids)
    sys.stderr.write("n_screened_ids: %s\n" % n_screened_ids)

    # Use .unq file to translate to md5 IDs.

    uniprot_unq_file = "%s/%s-profile_fullsequences.unq" \
                       % (rundir, template_md5)
    target_md5s = translate_ids(uniprot_unq_file, screened_ids)
    n_target_md5s = len(target_md5s)
    sys.stderr.write("n_target_md5s %s\n" % n_target_md5s)

    # Create list of files.  Limit to available files -- in case calculations are
    # in progress, or in case ModPipe skipped some (too short, etc.).

    pattern = "%s/data/???/*/sequence/*.mod" % rundir
    available_modfiles = glob.glob(pattern)
    #      ----+----1----+---
    print "Available targets:", len(available_modfiles)
    print

    modfiles = []
    for target_md5 in target_md5s:
        prefix = target_md5[0:3]
        modfile = "%s/data/%s/%s/sequence/%s.mod" % \
                  (rundir, prefix, target_md5, target_md5)
        if modfile in available_modfiles:
            modfiles.append(modfile)
        else :
            sys.stderr.write("screened out: %s\n" % os.path.basename(modfile))

    return modfiles


# ------------------------------------------------------------------------------
def get_chain_length(pdbid_chain):

    """Return the length of a PDB chain, using the in-memory table as a cache.

    A missing entry (or one stored as 0) is looked up with
    length_from_pdb() and stored in the table.  After every 20th newly
    added chain the table is written back with save_chain_lengths().
    """

    global chain_length_file, chain_length_table, new_chains

    cached_length = chain_length_table.get(pdbid_chain, 0)
    if cached_length != 0:
        return cached_length

    # Not known yet: fetch it and remember it.

    fetched_length = length_from_pdb(pdbid_chain)
    chain_length_table[pdbid_chain] = fetched_length

    # Every so often save to file.

    new_chains += 1
    if new_chains % 20 == 0:
        save_chain_lengths()

    return fetched_length


# ------------------------------------------------------------------------------
def save_chain_lengths():
    """Rewrite the chain-length file from the in-memory table.

    Writes one "<pdbid+chain> <length>" line per entry of
    chain_length_table to chain_length_file, replacing its previous
    contents.  Entries are written in sorted key order so that successive
    saves of the same table produce identical files, and the file is closed
    even if a write fails.
    """
    global chain_length_file, chain_length_table

    fp = open(chain_length_file, "w")
    try:
        for pdbid_chain, length in sorted(chain_length_table.items()):
            fp.write("%s %s\n" % (pdbid_chain, length))
    finally:
        fp.close()


# ------------------------------------------------------------------------------
def length_from_pdb(pdbid_chain):
    """Return the number of residues in the given PDB chain.

    pdbid_chain is the 4-character PDB ID followed by the chain letter.
    The chain is fetched with modpipe.pdbutils.fetch_PDB_chain() from the
    directories in the global pdbrep, and the length of its residue list is
    returned.
    """
    global pdbrep

    # NOTE(review): indexing position 4 raises IndexError for a bare
    # 4-character ID with no chain letter -- confirm callers never pass one.
    code, chain_id = pdbid_chain[0:4], pdbid_chain[4]

    modeller_env = modeller.environ()
    modeller_env.io.atom_files_directory = pdbrep

    fetched = modpipe.pdbutils.fetch_PDB_chain(modeller_env, code, chain_id)
    return len(fetched.residues)


# ------------------------------------------------------------------------------
def get_ids_from_prf(prf_file):

    """Go through profile file, return list of IDs, screening out those identical
    (99%+ sequence identity) to target.

    The first seven lines (six info lines plus the target, i.e. the
    template of interest) are skipped.  On every following non-blank line
    the second whitespace-separated field is taken as the ID and the
    eleventh as the percent sequence identity.  Blank lines are ignored.
    IDs are returned in file order.
    """

    n_skipped_lines = 7
    ids = []
    fp = open(prf_file, "r")
    try:
        for line_number, line in enumerate(fp):
            if line_number < n_skipped_lines:
                continue

            fields = line.split()
            if not fields:

                # Blank (e.g. trailing) line -- nothing to parse.

                continue

            # Skip if sequence identity >= 99%.

            if float(fields[10]) >= 99.0:
                continue

            ids.append(fields[1])
    finally:
        fp.close()

    return ids


# ------------------------------------------------------------------------------
def translate_ids(unq_file, uniprot_ids):

    """Return list of md5 IDs corresponding to UniProt IDs.

    Each non-blank line of unq_file is "<md5> : <id> [<id> ...]"; blank
    lines are ignored.  The returned list holds each md5 once, in the order
    in which it is first reached through uniprot_ids.  UniProt IDs that do
    not occur in the file are reported on standard error and skipped.
    """

    # Read unq file lines.  Create hash.  unq clusters near-identical
    # sequences -- indicates multiple UniProt IDs.

    md5s_table = {}
    fp = open(unq_file, "r")
    try:
        for line in fp:
            line = line.strip()
            if not line:
                continue

            fields = line.split(":")
            md5 = fields[0].strip()
            for member_id in fields[1].split():
                md5s_table[member_id] = md5
    finally:
        fp.close()

    # Collect md5s, avoiding duplicates while keeping first-seen order.

    md5s = []
    seen = set()
    for uniprot_id in uniprot_ids:
        md5 = md5s_table.get(uniprot_id, '')
        if not md5:
            sys.stderr.write("Could not find in .unq: " + uniprot_id + "\n")
        elif md5 not in seen:
            seen.add(md5)
            md5s.append(md5)

    return md5s


# ------------------------------------------------------------------------------
def exclude( hits_mode, exclude_hits_mode ) :

    """Tell whether a template hit should be left out of the results.

    hits_mode is the template's "fold assignment method" code and
    exclude_hits_mode the four-character code from the command line.
    Returns True when, at any of the four positions, both codes have a
    character other than "0"; returns False otherwise.
    """

    # Default setting excludes nothing -- no need to look at hits_mode.

    if exclude_hits_mode == "0000":
        return False

    # True as soon as one position is flagged in both codes.

    return any(exclude_hits_mode[position] != "0"
               and hits_mode[position] != "0"
               for position in range(4))



# ------------------------------------------------------------------------------
def residue_tally( residue_best, results, target_beg, target_end, pdbid ):

    """Fold one template's scores into the per-residue best-score record.

    residue_best holds one list per target residue; results holds this
    template's value for each score selected in the global i_scores.  For
    target residues target_beg..target_end (1-based, inclusive) a still
    empty residue list is first filled with the worst possible value for
    each score, and then every entry that results improves on (according to
    the global best_dir) is overwritten.  residue_best is modified in
    place; pdbid is not used.
    """
    global best_dir, i_scores

    for position in range(target_beg - 1, target_end):
        per_score_best = residue_best[position]

        # First template covering this residue: start from the worst value
        # for each score's direction.

        if not per_score_best:
            per_score_best.extend([-1.0e6 * best_dir[score_index]
                                   for score_index in i_scores])

        for slot, score_index in enumerate(i_scores):
            candidate = results[slot]
            direction = best_dir[score_index]
            if candidate * direction > per_score_best[slot] * direction:
                per_score_best[slot] = candidate


# ------------------------------------------------------------------------------
def hist_tally(hist, cats, datum):

    """Count datum in the first histogram bin whose bound it does not reach.

    cats holds the bin bounds in the order they are tested; hist holds the
    matching counts and is updated in place.  A datum that is >= every
    bound is not counted at all.
    """

    for bin_index, bound in enumerate(cats):
        if not datum >= bound:
            hist[bin_index] += 1
            return


# ------------------------------------------------------------------------------
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
