#!/usr/bin/python
# This file is part of ModPipe, Copyright 1997-2010 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

from modeller import *
from modpipe.alnutils import *
from modpipe.sequtils import *
from modpipe.resutils import *
from optparse import OptionParser
import modpipe.version
import sys, os, re, tempfile

# Module-level setup, executed on import.  `log` and `environ` are not
# defined in this file; they arrive through the star imports above
# (presumably from `modeller` -- confirm).  `env` is the shared environment
# object that main() passes to every `alignment(...)` it constructs.
log.minimal()
env = environ()

# When True, main() additionally writes an identity table to
# "<input file>.id" and prints the computed percent identities.
debug_f = False

def _build_option_parser():
    """Return the OptionParser describing this script's command line."""

    parser = OptionParser(version=modpipe.version.message())

    parser.set_usage("""
This script takes a file of sequences and determines which are unique
within a specified percent identity.  It returns a new file or files
of unique sequences.

   %prog [options] """)

    # Set defaults.

    parser.set_defaults(infile='',
                        fileformat='PIR',
                        outfile='',
                        percent_id_cutoff=90)

    # Populate options list.

    parser.add_option("-i", "--input_file",
                 dest="infile",
                 type='string',
                 help="""File containing sequences to process (required)""",
                 metavar="FILE")
    parser.add_option("-f", "--file_format",
                 dest="fileformat",
                 type='string',
                 help="""Format of input and output sequence files
                         PIR | FASTA (default: 'PIR')""",
                 metavar="FORMAT")
    parser.add_option("-o", "--output_file",
                 dest="outfile",
                 type='string',
                 help="""File containing unique sequences
                 (default: each unique sequence is written to its own file)""",
                 metavar="FILE")
    parser.add_option("-p", "--percent_id_cutoff",
                 dest="percent_id_cutoff",
                 type='float',
                 help="""Cutoff below which sequences considered unique
                 (default: 90)""",
                 metavar="PERCENT")

    return parser


def main():
    """Select the sequences of the input file that are mutually "unique".

    Workflow:

    1. Parse the command line.  Print the help text and exit with status 1
       if no input file was given; report a parser error if the given path
       is not an existing file.
    2. Read all sequences of the input file into an alignment.
    3. Repeatedly take the first remaining sequence as a representative,
       record it, align the remaining sequences, and keep for the next
       round only those whose identity to the representative is below
       ``--percent_id_cutoff``.
    4. Write the representatives either to ``--output_file`` (all together)
       or, when no output file was given, each to its own file named
       ``<code>.<format in lower case>``.

    The last remaining sequence is also written: it passed every earlier
    cutoff filter, so it is unique by the same criterion as the others.
    """

    # Parse command line options and check the mandatory ones.

    parser = _build_option_parser()
    opts, args = parser.parse_args()

    if not opts.infile:
        parser.print_help()
        sys.exit(1)

    if not os.path.isfile(opts.infile):
        parser.error("""Cannot proceed without an input file""")

    # Read sequences.  Create alignment object, read in.

    aln = alignment(env)
    aln.append(file=opts.infile, align_codes='all',
               alignment_format=opts.fileformat)

    final_aln = alignment(env)
    n_sequences_written = 0

    # State for disambiguating repeated sequence codes.  Only a code equal
    # to the most recent *base* code gets a numeric suffix, and the counter
    # is never reset, so suffixes keep increasing across different codes.
    prev_code = ''
    seq_no = 0

    # Continue until no sequences remain in the "outgroup".

    while len(aln) > 0:

        # Final alignment will include first remaining sequence.

        code, sequence_chars = get_sequence_chars(aln, 0)
        final_aln.append_sequence(sequence_chars)
        if code == prev_code:
            seq_no += 1
            code += str(seq_no)
        else:
            prev_code = code

        final_aln[-1].code = code
        n_sequences_written += 1

        # If no output file specified, write to file named by sequence code
        # and clear alignment so the next file holds a single sequence.

        if not opts.outfile:
            seq_file = code + "." + opts.fileformat.lower()
            final_aln.write(seq_file, alignment_format=opts.fileformat)
            final_aln.clear()

        # The representative was the only sequence left: there is nothing
        # to compare it against, so skip the alignment step and finish.

        if len(aln) == 1:
            break

        # Align sequences.

        aln.salign()

        # Write table of percent IDs for check purposes.

        if debug_f:
            aln.id_table('%s.id' % opts.infile)

        # Calculate percent IDs -- first sequence with each other sequence.

        percent_ids = calc_percent_ids(aln)
        if debug_f:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("percent_ids: %s" % (percent_ids,))

        # Create new alignment consisting of all sequences with percent ID
        # compared to first below cutoff.  percent_ids[i] belongs to
        # sequence i + 1 because the representative (index 0) is skipped.

        new_aln = alignment(env)
        for i, percent_id in enumerate(percent_ids):
            if percent_id < opts.percent_id_cutoff:
                other_code, other_chars = get_sequence_chars(aln, i + 1)
                new_aln.append_sequence(other_chars)
                new_aln[-1].code = other_code

        # Reassign alignment, repeat.

        aln = new_aln

    # Done.  If single output file, write out final alignment.

    if opts.outfile:
        final_aln.write(opts.outfile, alignment_format=opts.fileformat)

    print("")
    print("Done. %d sequences written." % n_sequences_written)

#----------------------------------------------------------------------------
def get_sequence_chars(aln, i_seq):
    """Return ``(code, residues)`` for the sequence at index ``i_seq``.

    ``code`` is the ``code`` attribute of ``aln[i_seq]``.  ``residues`` is
    the string obtained by passing the result of
    ``get_alignment_positions(aln.positions, aln[i_seq])`` through
    ``remove_gaps`` and joining the pieces with no separator.
    """
    entry = aln[i_seq]
    entry_code = entry.code

    ungapped = remove_gaps(get_alignment_positions(aln.positions, entry))

    return entry_code, ''.join(ungapped)

#----------------------------------------------------------------------------
def calc_percent_ids(aln):
    """Return the identity of the first sequence to every other sequence.

    The result is a list with one entry per sequence after the first, in
    alignment order: entry ``k`` is
    ``aln[0].get_sequence_identity(aln[k + 1])``.  An alignment holding a
    single sequence gives an empty list.
    """
    total = len(aln)
    reference = aln[0]
    return [reference.get_sequence_identity(aln[idx])
            for idx in range(1, total)]


#----------------------------------------------------------------------------
# Run the command-line workflow only when this file is executed as a
# script; importing it as a module does not call main().
if __name__ == "__main__":
    main()
