#!/usr/bin/env python2.3
################################################################################
#
#       This file is part of the GQL (Graphical Query Language) Toolkit
#
#       file:   GQLQMixture.py
#       author: Alexander Schliep (alexander@schliep.org) and
#
#       Copyright (C) 2003-2004 Alexander Schliep
#                                   
#       Contact: alexander@schliep.org           
#
#       Information: http://ghmm.org/gql
#
#   GQL is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   GQL is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with GQL; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
#
#       This file is version $Revision: 1265 $ 
#                       from $Date: 2006-11-09 10:07:15 -0300 (Thu, 09 Nov 2006) $
#             last change by $Author: filho $.
#
################################################################################
from ghmm import *
import numpy.oldnumeric as Numeric
import math
import getopt, sys, string
import copy
import random
import mixture
import mixtureHMM

def Entropy(prob_dist, base=None):
    """ Returns the entropy of a discrete probability distribution.

        prob_dist is any iterable of probabilities. Entries that are
        not strictly positive contribute nothing, following the
        convention lim_(x->0) x log(x) = 0.

        By default the natural logarithm is used. If base is given,
        the entropy is expressed in logarithms to that base (e.g.
        base=2 yields bits).

        An empty distribution has entropy 0.0.
    """
    result = 0.0
    for p in prob_dist:
        # Zero probabilities are skipped: log(0) is undefined, but the
        # limit of p * log(p) for p -> 0 is 0.
        if p > 0.0:
            result -= p * math.log(p)
    if base is not None:
        # Change of base: log_b(x) = ln(x) / ln(b)
        result /= math.log(base)
    return result


def sumlogs(a):
    r""" Given a sequence a of log p_i, return log(sum p_i)

        Uses (with m the maximal log p_i):
        log(\Sum p_i) = m + log(\Sum_i exp(log(p_i) - m))

        so that only differences to the maximum are exponentiated.

        Entries equal to -Inf (zero probability) do not contribute.
        If every entry is -Inf, or a is empty, the result is -Inf
        (the log of a zero sum).

        XXX should be coded in C, check whether part of Numeric
    """
    minus_infinity = -float('Inf')
    if len(a) == 0:
        # Empty sum: log(0). Avoids the ValueError max() would raise.
        return minus_infinity

    m = max(a) # The maximal value need not be unique
    result = 0.0
    for x in a:
        if x >= m:
            # Every maximal value contributes exp(0) = 1
            result += 1.0
        elif x != minus_infinity:
            delta = x - m
            # Special case to avoid numerical problems: differences
            # smaller in magnitude than 1.0e-16 are treated as exp(delta) = 1
            if delta < -1.0e-16:
                result += math.exp(delta) # XXX use approximation
            else:
                result += 1.0
        # x == -Inf: zero probability, does not contribute

    return math.log(result) + m

def printMessage(string):
    """ Write the given message to standard output, followed by a newline. """
    print(string)

def estimate_mixture_ext(mixtureModel, data, max_iter, eps, progress=None):
    """ Mixture estimation delegated to mixture.MixtureModel.EM.

        Runs EM on mixtureModel with the given data, iteration limit
        and eps, and repackages the outcome as a triple
        (log_p, pi, P): the second value returned by EM, the model's
        pi attribute converted to a list, and the transposed
        element-wise exponential of the first value returned by EM.

        progress is accepted for signature compatibility but is not used.
    """
    em_outcome = mixture.MixtureModel.EM(mixtureModel, data, max_iter, eps)
    log_l, log_p = em_outcome
    coefficients = mixtureModel.pi.tolist()
    posteriors = Numeric.transpose(Numeric.exp(log_l))
    return (log_p, coefficients, posteriors)
       
def estimate_mixture(models, seqs, max_iter, eps, fixed_models, alpha=None,
                     progress=None, previous_weights=None, constraints=None,
                     neg_constraints=None, prior_weight=0, prior_neg_weight=0,
                     prior_type=0,gibsiterations=1,constraintsNorm=0):


    """ Given a Python-list of models and a SequenceSet seqs
        perform an nested EM to estimate maximum-likelihood
        parameters for the models and the mixture coefficients.
        The iteration stops after max_iter steps or if the
        improvement in log-likelihood is less than eps.

        alpha is a Numeric of dimension len(models) containing
        the mixture coefficients. If alpha is not given, uniform
        values will be chosen.

        If previous_type!=0, EM with constraints is performed. For this we need
        a NxN constraints matrix, the last posterior estimation used for the models
        assigment and the weights of the priors. prior_type=1 uses only positive
        constraints, prior_type=2 only negative constraints and prior_type 3 both.

	outputMethod - method to print messages ...

        Result: The models are changed in place. Return value
        is (l, alpha, P) where l is the final log likelihood of
        seqs under the mixture, alpha is a Numeric of
        dimension len(models) containing the mixture coefficients
        and P is a (#sequences x #models)-matrix containing
        P[model j| sequence i]

    """
    done = 0
    iter = 1
    norm = 0.0
    #minus_infinity = -float(1E300)
    minus_infinity = -float('Inf')
    last_mixture_likelihood = None
    reestimation_logalpha_cutoff = math.log(10.0 / len(seqs))
    # The (nr of seqs x nr of models)-matrix holding the likelihoods
    l = Numeric.zeros((len(seqs), len(models)), Numeric.Float)
    
    if alpha == None: # Uniform alpha
        logalpha = Numeric.ones(len(models), Numeric.Float) * \
                   math.log(1.0/len(models))
    else:
        # normalizing the alphas which are not fixed
#        for i in xrange(len(models)):
#	    if i in fixed_models: # do not change the alpha of fixed values
#		norm += alpha[i]
#	print norm
#        for i in xrange(len(models)):
#	    if not i in fixed_models: # only for not fixed alphas
#	        alpha[i] = alpha[i]/(1-norm)
	logalpha = Numeric.log(alpha)

    #print logalpha, Numeric.exp(logalpha)
    print logalpha
    print Numeric.exp(logalpha)
    print 'models', models 


    if prior_type != 0:        
        # peform mixture estimation with constraints
        # check paramenters !!!
        assert prior_weight  >= 0.0 or prior_neg_weight >= 0.0 
        print prior_weight
        #constraints = Numeric.array(constraints,Numeric.Float)
        

    log_nrseqs = math.log(len(seqs))

    while 1:
        # Score all sequences with all models
        for i, m in enumerate(models):
            loglikelihood = m.loglikelihoods(seqs)
            #print "#model %d min(loglklhsd)=%f max()=%f" % (i, min(loglikelihood),max(loglikelihood))

            # NOTE: loglikelihood might contain -Inf for sequences which cannot be built
            # Numeric slices: l[:,i] is the i-th column of l
            l[:,i] = Numeric.array(loglikelihood)    
        if prior_type == 0: # regular mixture estimation 
          for i in xrange(len(seqs)):
              # Leaves -Inf values unchanged            
              l[i] += logalpha # l[i,k] = log( a_k * P[seq i| model k])

          # Compute P[model j| seq i]
          mixture_likelihood = 0.0
          for i in xrange(len(seqs)):

            # We want to compute \sum_{k} a_k P[seq i| model k]
            # from the log( a_k * P[seq i| model k]) we have
            # NOTE: The sumlogs functions returns the sum for values != -Inf
            seq_logprob = sumlogs(l[i])
            # By subtracting the log of the sum we divide it and obtain
            # a prob dist.
            l[i] -= seq_logprob # l[i] = ( log P[model j | seq i] )
            mixture_likelihood += seq_logprob

          #print l
          
          # NOTE: Numeric.exp gives underflow warnings when computing exp
          # for values of  -7.1e2 and smaller. We set them to -Inf manually
          # (exp(-7.1e2) ~ 4.4762862256751298e-309 anyways
          l = Numeric.where(l > -4.8e2, l, minus_infinity)
          l_exp = Numeric.exp(l) # XXX Use approx with table lookup
          
        else: # mixture estimation with constraints
          # Compute P[model j| seq i]
          alpha = Numeric.exp(Numeric.where(logalpha > -4.8e2, logalpha, minus_infinity)) 
          mixture_likelihood = 0.0
          for n in range(gibsiterations): # replicating the assignement steps
            indices = range(len(seqs))
            if prior_type != 0: # if constraints learning, suffle order of posterior calculations
              random.shuffle(indices)              
            for x,i in enumerate(indices):
              # Leaves -Inf values unchanged
              pen = Numeric.zeros(len(models),Numeric.Float)
              penn = Numeric.zeros(len(models),Numeric.Float) 
              for y,j in enumerate(indices):
                  # calculating penalities
                  # in a Gibbs sampling manner (using either previous or current posteriors
                  coc = None
                  if y > x: # if posterior not yet calculated, used of ones                      
                    coc =  Numeric.multiply(previous_weights[j],1.0)
                    if prior_type == 1 or prior_type == 3:
                      if constraints[i][j] > 0.0:
                        if constraintsNorm:
                          pen += Numeric.divide(Numeric.multiply(1-coc, constraints[i][j]),((1-alpha)))
                        else:
                          pen += Numeric.multiply(1-coc, constraints[i][j])                          
                    if prior_type == 2 or prior_type == 3:
                      if neg_constraints[i][j] > 0.0:
                        if constraintsNorm:
                          penn += Numeric.divide(Numeric.multiply(coc, neg_constraints[i][j]),alpha)
                        else:
                          penn += Numeric.multiply(coc, neg_constraints[i][j])
                  elif y < x:                      
                    coc =  Numeric.multiply(Numeric.exp(l[j]),1)
                    if prior_type == 1 or prior_type == 3:
                      if constraints[i][j] > 0.0:
                        if constraintsNorm:
                          pen += Numeric.divide(Numeric.multiply(1-coc, constraints[i][j]),((1-alpha)))
                        else:
                          pen += Numeric.multiply(1-coc, constraints[i][j])
                    if prior_type == 2 or prior_type == 3:
                        if neg_constraints[i][j] > 0.0:
                          penn += Numeric.divide(Numeric.multiply(coc, neg_constraints[i][j]),alpha)
                        else:
                          penn += Numeric.multiply(coc, neg_constraints[i][j])   
              #print pen
              l[i] += (logalpha - Numeric.multiply(pen,prior_weight) - Numeric.multiply(penn,prior_neg_weight))                           # l[i,k] = log( a_k * P[W+|y] * P[W-|y] * P[seq i| model k])

              seq_logprob = sumlogs(l[i]) 
              l[i] -= seq_logprob # l[i] = ( log P[model j | seq i] )
              mixture_likelihood += seq_logprob

              # NOTE: Numeric.exp gives underflow warnings when computing exp
              # for values of  -7.1e2 and smaller. We set them to -Inf manually
              # (exp(-7.1e2) ~ 4.4762862256751298e-309 anyways
              l[i] = Numeric.where(l[i] > -4.8e2, l[i], minus_infinity)

            l_exp = Numeric.exp(l) # XXX Use approx with table lookup
            previous_weights = l_exp # keep l_exp for Gibbs sampling

        #if prior_weight != 0:
          
        
        # NOTE: exp(-Inf) = 0.0 in l_exp
        #print "exp(l)", l_exp

        row_sums = Numeric.sum(Numeric.transpose(l_exp),0)
        if abs(1.0 - min(row_sums)) < 1e-10 and abs(max(row_sums) - 1.0) < 1e-10:
#            print "l_exp row sums are all one"
             print "l_exp row sums are all one"
        else:
            print row_sums
            print row_sums

#        print "# iter %s joint likelihood = %f" % (iter, mixture_likelihood)
#        outputMethod("# iter %s joint likelihood = %f" % (iter, mixture_likelihood))

	norm = 0.0

        # Compute priors alpha P(Model i|Sequences)
        for i in xrange(len(models)):
            # NOTE: The sumlogs functions returns the sum for values != -Inf
	    if not i in fixed_models: # do not change the alpha of fixed values
	        logalpha[i] = sumlogs(l[:,i]) - log_nrseqs
    	    norm += pow(2,logalpha[i])

        # Normalizing the prior (given the fixed models)
        print norm
        print logalpha

        logalpha_exp = []
        for i in xrange(len(models)):
	    if not i in fixed_models: # only for not fixed alphas
                if logalpha[i] != minus_infinity:
	            logalpha[i] = math.log(pow(2,logalpha[i])/norm,2)
                    logalpha_exp.append(pow(2,logalpha[i]))

        print "logalpha", logalpha, logalpha_exp

        # Decide whether we want to go on or not
        if max_iter == 0:
            break
        if last_mixture_likelihood == None: # First time through while-loop
            last_mixture_likelihood = mixture_likelihood
        else:
            improvement = mixture_likelihood - last_mixture_likelihood
            if iter > max_iter or (0.0 < improvement and improvement < eps):
                break

        for j, m in enumerate(models):
            # Set the sequence weight for sequence i under model m to P[m| i]
            # NOTE: If model m is really unpopular this can lead to numerical
            # instabilities. Rescale the weight vector, so that it sums to unity
            # This doesnt solve the problem
            # More generally: if s below is really tiny we should neither
            # reestimate nor use that model in the calculations
            #
            w = copy.deepcopy(l_exp[:,j])
            s = Numeric.sum(w,0)
#            print "weight sum=%e min=%e max=%e" % (s, min(w), max(w))
            print "weight sum=%e min=%e max=%e" % (s, min(w), max(w))

            if s < 1e-200: # This case cannot be handled due to limited range.
                # In a log-based implementation we would have enough precision
                # to still train the corresponding model. Here we dont
                #
                # XXX Possible fix. Train model with all sequences equally weighted?
                # Still produces problems in the BW-implementation ... (has too)
                #
                print "# unnecessary model %d in mixture" % j
                #"# unnecessary model %d in mixture" % j)
                # Increase variances ???
                #w = Numeric.ones(len(w),Numeric.Float)
                continue
            else:
                # NOTE: scaling might throw underflow (bug in Numeric?)
                # might have been caused too small values in l, which got
                # propagated to real tiny ones in l_exp. Seems okay now
                w /= s
                print "scaled sum=%e min=%e max=%e" % (Numeric.sum(w,0), min(w), max(w))
                #outputMethod("scaled sum=%e min=%e max=%e" % (Numeric.sum(w,0), min(w), max(w)))

            if not j in fixed_models:
                for i in xrange(len(seqs)):
                    seqs.setWeight(i,w[i])
#                print " Reestimating model", j
                print " Reestimating model %i"%j
                m.baumWelch(seqs, 20, 0.0001)

        # update the progress bar
        if progress != None:
            status = (100*iter/max_iter)
            print "prg", status
            progress.updateProgress(status)
        
        #printMessage("Iteration %d: Likelihood:%d"%(iter,mixture_likelihood))
        iter += 1

        #print "exp_l min=%f max=%f" % (min(min(l_exp)), max(max(l_exp)))
        last_mixture_likelihood = mixture_likelihood
       
    return (mixture_likelihood, Numeric.exp(logalpha).tolist(), l_exp)

def estimate_mixture_partials(models, seqs, max_iter, eps, fixed_models, partial_labels, alpha=None, progress=None):
    """ Given a Python-list of models and a SequenceSet seqs
        perform an nested EM to estimate maximum-likelihood
        parameters for the models and the mixture coefficients.
        The iteration stops after max_iter steps or if the
        improvement in log-likelihood is less than eps.

        alpha is a Numeric of dimension len(models) containing
        the mixture coefficients. If alpha is not given, uniform
        values will be chosen.

        Result: The models are changed in place. Return value
        is (l, alpha, P) where l is the final log likelihood of
        seqs under the mixture, alpha is a Numeric of
        dimension len(models) containing the mixture coefficients
        and P is a (#sequences x #models)-matrix containing
        P[model j| sequence i]

        Using partial labels
    """
    done = 0
    iter = 1
    minus_infinity = -float('Inf')
    last_mixture_likelihood = None
    reestimation_logalpha_cutoff = math.log(10.0 / len(seqs))
    # The (nr of seqs x nr of models)-matrix holding the likelihoods
    l = Numeric.zeros((len(seqs), len(models)), Numeric.Float)
    print len(models)
    if alpha == None: # Uniform alpha
        logalpha = Numeric.ones(len(models), Numeric.Float) * \
                   math.log(1.0/len(models))
    else:
        logalpha = Numeric.log(alpha)
    print logalpha, Numeric.exp(logalpha)
    log_nrseqs = math.log(len(seqs))

    while 1:
        # Score all sequences with all models
        for i, m in enumerate(models):
            loglikelihood = m.loglikelihoods(seqs)
            #print "#model %d min(loglklhsd)=%f max()=%f" % (i, min(loglikelihood),max(loglikelihood))

            # NOTE: loglikelihood might contain -Inf for sequences which cannot be built
            # Numeric slices: l[:,i] is the i-th column of l
            l[:,i] = Numeric.array(loglikelihood)

        #print l
        for i in xrange(len(seqs)):
            # Leaves -Inf values unchanged
            l[i] += logalpha # l[i,k] = log( a_k * P[seq i| model k])
        #print l

        # Compute P[model j| seq i]
        mixture_likelihood = 0.0
        for i in xrange(len(seqs)):
            # We want to compute \sum_{k} a_k P[seq i| model k]
            # from the log( a_k * P[seq i| model k]) we have
            # NOTE: The sumlogs functions returns the sum for values != -Inf
            seq_logprob = sumlogs(l[i])
            # By subtracting the log of the sum we divide it and obtain
            # a prob dist.
            l[i] -= seq_logprob # l[i] = ( log P[model j | seq i] )
            mixture_likelihood += seq_logprob
        #print l

        # NOTE: Numeric.exp gives underflow warnings when computing exp
        # for values of  -7.1e2 and smaller. We set them to -Inf manually
        # (exp(-7.1e2) ~ 4.4762862256751298e-309 anyways
        l = Numeric.where(l > -4.8e2, l, minus_infinity)

        l_exp = Numeric.exp(l) # XXX Use approx with table lookup
        # NOTE: exp(-Inf) = 0.0 in l_exp
        #print "exp(l)", l_exp

        row_sums = Numeric.sum(Numeric.transpose(l_exp),0)
        if abs(1.0 - min(row_sums)) < 1e-10 and abs(max(row_sums) - 1.0) < 1e-10:
            print "l_exp row sums are all one"
        else:
            print row_sums

        print "# iter %s joint likelihood = %f" % (iter, mixture_likelihood)

        # Compute priors alpha
        for i in xrange(len(models)):
            # NOTE: The sumlogs functions returns the sum for values != -Inf
            logalpha[i] = sumlogs(l[:,i]) - log_nrseqs

        logalpha_exp = Numeric.exp(logalpha)
        print "logalpha", logalpha, min(logalpha_exp), max(logalpha_exp),logalpha_exp


	# Partially supervised training
        # For sequences with partial information use a weight vector s.t.
        # P[model|seq] = 1 if model = label[seq] and 0 else
        # XXX Ideally we would not want to compute all the other
        # stuff for labelled sequences in the first place
        for seq in partial_labels.keys():
            p_vec = Numeric.zeros(len(models), Numeric.Float)
            p_vec[partial_labels[seq]] = 1.0
            l_exp[seq] = p_vec

	# Decide whether we want to go on or not
        if max_iter == 0:
            break
        if last_mixture_likelihood == None: # First time through while-loop
            last_mixture_likelihood = mixture_likelihood
        else:
            improvement = mixture_likelihood - last_mixture_likelihood
            if iter > max_iter or (0.0 < improvement and improvement < eps):
                break


	for j, m in enumerate(models):
            # Set the sequence weight for sequence i under model m to P[m| i]
            # NOTE: If model m is really unpopular this can lead to numerical
            # instabilities. Rescale the weight vector, so that it sums to unity
            # This doesnt solve the problem
            # More generally: if s below is really tiny we should neither
            # reestimate nor use that model in the calculations
            #
            w = copy.deepcopy(l_exp[:,j])
            s = Numeric.sum(w,0)
            print "weight sum=%e min=%e max=%e" % (s, min(w), max(w))

            if s < 1e-200: # This case cannot be handled due to limited range.
                # In a log-based implementation we would have enough precision
                # to still train the corresponding model. Here we dont
                #
                # XXX Possible fix. Train model with all sequences equally weighted?
                # Still produces problems in the BW-implementation ... (has too)
                #
                print "# unnecessary model %d in mixture" % j
                # Increase variances ???
                #w = Numeric.ones(len(w),Numeric.Float)
                continue
            else:
                # NOTE: scaling might throw underflow (bug in Numeric?)
                # might have been caused too small values in l, which got
                # propagated to real tiny ones in l_exp. Seems okay now
                w /= s
                print "scaled sum=%e min=%e max=%e" % (Numeric.sum(w,0), min(w), max(w))

            if not j in fixed_models:
                for i in xrange(len(seqs)):
                    seqs.setWeight(i,w[i])
                print " Reestimating model", j
                m.baumWelch(seqs, 20, 0.1)

         # update the progress bar
        if progress != None:
            status = (100*iter/max_iter)
            print "prg", status
            progress.updateProgress(status)               

        iter += 1
        #print "exp_l min=%f max=%f" % (min(min(l_exp)), max(max(l_exp)))
        last_mixture_likelihood = mixture_likelihood

    return (mixture_likelihood, Numeric.exp(logalpha), l_exp)


def estimate_clustering(models, seqs, max_iter, eps, fixed_models, progress=None):
    """ Given a Python-list of models and a SequenceSet seqs
        perform an nested EM to estimate maximum-likelihood
        parameters for the models and a ML cluster assignment.
        The iteration stops after max_iter steps or if the
        improvement in log-likelihood is less than eps.

        Result: The models are changed in place. Return value
        is (l, P) where l is the final log likelihood of
        seqs under the mixture and P is a (#sequences x #models)-matrix
        containing P[model j| sequence i]
    """
    done = 0
    iter = 1
    #minus_infinity = -float(1E300)
    minus_infinity = -float('Inf')
    last_clustering_likelihood = None

    # The (nr of seqs x nr of models)-matrix holding the likelihoods
    print len(seqs), len(models)
    l = Numeric.zeros((len(seqs), len(models)), Numeric.Float)
    assignment = [None] * len(seqs)

    while 1:
        # Score all sequences with all models
        for i, m in enumerate(models):
            loglikelihood = m.loglikelihoods(seqs)             
            # NOTE: loglikelihood might contain -Inf for sequences which cannot be built
            # Numeric slices: l[:,i] is the i-th column of l
            l[:,i] = Numeric.array(loglikelihood)

        clustering_likelihood = 0.0
        for i in xrange(len(seqs)):
            assignment[i] = int(Numeric.argmax(l[i]))
            clustering_likelihood += l[i,assignment[i]]

        print "# iter %s joint likelihood = %f" % (iter, clustering_likelihood)

        # Decide whether we want to go on or not
        if max_iter == 0:
            break
        if last_clustering_likelihood == None: # First time through while-loop
            last_clustering_likelihood = clustering_likelihood
        else:
            improvement = clustering_likelihood - last_clustering_likelihood
            if iter > max_iter or (0.0 < improvement and improvement < eps):
                break

        for j, m in enumerate(models):
            if not j in fixed_models:
                for i in xrange(len(seqs)):
                    # XXX Hack. Use SequenceSet.getSubset() instead
                    if assignment[i] == j:
                        seqs.setWeight(i, 1.0)
                    else:
                        seqs.setWeight(i, 0.0)
                print " Reestimating model", j
                m.baumWelch(seqs, 20, 0.1)

          # update the progress bar
        if progress != None:
            status = (100*iter/max_iter)
            print "prg", status
            progress.updateProgress(status)              

        iter += 1
        last_clustering_likelihood = clustering_likelihood

    return (clustering_likelihood, assignment, l)




def decode_mixture(P, entropy_cutoff):
    """ Turn posteriors into hard assignments where they are clear enough.

        P is a (#sequences x #models)-matrix with
        P[i,j] = P[model j| sequence i]. The result is a list with
        one entry per sequence: the index of the model with the
        largest posterior (the first one in case of ties) if the
        entropy of row i is less than entropy_cutoff, and None
        otherwise.
    """
    seq_count = Numeric.shape(P)[0]
    decoded = [None] * seq_count
    for seq in range(seq_count):
        if not (Entropy(P[seq]) < entropy_cutoff):
            # Posterior too spread out: leave the sequence unassigned
            continue
        # Linear scan for the first maximal entry of the row
        best_col = 0
        best_val = P[seq,0]
        for col in range(1, len(P[seq])):
            if P[seq,col] > best_val:
                best_val = P[seq,col]
                best_col = col
        decoded[seq] = best_col
    return decoded




# Help text printed by usage() when the command line cannot be parsed
# or has the wrong number of arguments.
# NOTE(review): this is not a raw string, so the "\t" sequences below
# are printed as real tab characters rather than as the two characters
# backslash-t. The text also contains typos ("continous",
# "Post-analyis"); they are left untouched here because the string is
# program output.
usage_info = """
GQLMixture.py [options] hmms.smo seqs.sqd [hmms-reestimated.smo]

Estimate a mixture of hmms from a file hmms.smo containing continous
emission HMMs and a set of sequences of reals given in the file
seqs.sqd.

Running:

-m iter Maximal number of iterations (default is 100)

-e eps  If the improvement in likelihood is below eps, the training
        is terminated (default is 0.001)

Post-analyis (the options are mutually exclusive):

-p      Output the matrix p_{ij} = P[model j| sequence i] to the console

-c      Cluster sequences. Assign each sequence i to the model maximizing
        P[model j| sequence i]. Outputs seq_id\tcluster_nr to the console

-d ent  Decode mixture. If the entropy of { P[model j| sequence i] } is
        less than 'ent', sequence i is assigned to the model maximizing
        P[model j| sequence i]. Outputs seq_id\tcluster_nr to the console,
        cluster_nr is None if no assignment was possible


Example:

GQLMixture.py -m 10 -e 0.1 -d 0.15 test2.smo test100.sqd reestimated.smo

"""

def usage():
    """ Print the command-line help text to standard output. """
    print(usage_info)

if __name__ == '__main__':
    # Default values
    max_iter = 100
    eps = 0.001
    output = None

    try:
        opts, args = getopt.getopt(sys.argv[1:], "m:e:pcd:", [])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ['-m']:
            max_iter = int(a)
        if o in ['-e']:
            eps = float(a)
        if o in ['-p']:
            output = 'p_matrix'
        if o in ['-c']:
            output = 'cluster'
        if o in ['-d']:
            output = 'decode'
            entropy_cutoff = float(a)

    if len(args) != 3:
        usage()
        sys.exit(2)

    hmmsFileName = args[0]
    seqsFileName = args[1]
    outFileName = args[2]

    models = HMMOpen(hmmsFileName)
    print "# Read %d models from '%s'" % (len(models), hmmsFileName)
    seqs = SequenceSet(Float(), seqsFileName)
    print "# Read %d sequences from '%s'" % (len(seqs), seqsFileName)
    (ml, alpha, P) = estimate_mixture(models, seqs, max_iter, eps, [])

    if output != None:
        if output == 'p_matrix':
            for i in xrange(len(seqs)):
                print string.join(map(lambda x:"%1.3f" % x, P[i]), '\t')
        else:
            if output == 'cluster':
                assignment = decode_mixture(P, len(models)) # max ent: log(len(models))
            else:
                assignment = decode_mixture(P, entropy_cutoff)
            for i, c in enumerate(assignment):
                print "%s\t%s" % (str(i), str(c))

