#!/usr/bin/env python2.3
################################################################################
#
#       This file is part of the GQL (Graphical Query Language) Toolkit
#
#       file:   GQLCluster.py
#       author: Alexander Schliep (alexander@schliep.org)
#
#       Copyright (C) 2003-2004 Alexander Schliep
#       Contact: alexander@schliep.org
#
#       Information: http://ghmm.org/gql
#
#	GQL is free software; you can redistribute it and/or modify
#       it under the terms of the GNU General Public License as published by
#	the Free Software Foundation; either version 2 of the License, or
#	(at your option) any later version.
#
#	GQL is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#	GNU General Public License for more details.
#
#	You should have received a copy of the GNU General Public License
#	along with GQL; if not, write to the Free Software
#	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
#
#       This file is version $Revision: 2147 $
#                       from $Date: 2009-11-16 22:37:53 -0300 (Mon, 16 Nov 2009) $
#             last change by $Author: schliep $.
#
################################################################################
#
#
#
#----- Globals -----------------------------------------------------------------
# Version number of the toolkit.
gGQLVersion = 0.6
# Build date string.
gGQLBuilddate = "1/15/2003"
# URL template for an NCBI Entrez nucleotide search; the %s placeholder
# receives the term that is looked up in the [accn] field.
gNCBIURL = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=search&db=nucleotide&term=%s[accn]"


#----- Imports -----------------------------------------------------------------
import os
import string
from GQL import ProfileSet,minWithoutMissing,MISSING_DATA
from GQLQuery import GQLQuery, SortOnItem
import ghmm
from GQLMixture import estimate_mixture, decode_mixture, estimate_mixture_partials, \
     estimate_clustering, Entropy, estimate_mixture_ext
from InitialModels import LinearModel, InitialCollection
from EditObjectAttributesDialog import ValidatingInt, ValidatingFloat, ValidatingString, \
     PopupableInt, Popupable
import GQLEvaluation
from GO.GODag import GODag
import GO.Comparirson
import GQLValidation
import math
import numpy.oldnumeric as Numeric
import random
import mixture
import mixtureHMM

#----- FilterFunctions ----------------------------------------------------------

def TwoFoldChange(p, fold):
    """Tell whether profile p leaves the band [-fold, fold] at least once.

    The lower bound is tested against minWithoutMissing(p), the upper
    bound against the plain maximum of p.
    """
    drops_below = minWithoutMissing(p) < -fold
    return drops_below or (max(p) > fold)

def TwoFoldChangeTwoSteps(p):
    """Placeholder for a two-step fold-change filter; not implemented.

    Always returns None.
    """
    return None

def NonConstant(p, d):
    """Tell whether the spread of profile p is larger than d.

    The spread is max(p) minus minWithoutMissing(p).
    """
    spread = max(p) - minWithoutMissing(p)
    return spread > d

# Registry of the available profile filters: maps a display name to the
# filter function. Both registered functions take (profile, parameter).
# The two-step variant is disabled because TwoFoldChangeTwoSteps is a stub.
# XXX Should be a singleton class ...
FilterFunctions = {'N-fold change':TwoFoldChange,
                   #'Two-fold change in two steps':TwoFoldChangeTwoSteps,
                   'Non-constant':NonConstant
                   }

# Negative infinity; used as the replacement value for very small
# log-values before exponentiation.
minus_infinity = float('-inf')


#----- Data --- -----------------------------------------------------------------

class ProfileSubSet:
    """Read-only view onto a selection of the profiles of a ProfileSet.

    Offers the accessors defined below so that it can be passed around in
    place of a ProfileSet. By design it is not a subclass of ProfileSet.
    """

    def __init__(self, profileSet, ids):
        """Create the view.

        profileSet -- the underlying ProfileSet
        ids        -- list of legal profile identifiers (ints) in profileSet
        """
        parent = profileSet
        self.profileSet = parent
        # Plot ranges and class bookkeeping are taken over unchanged from
        # the superset, so subset sequence plots are scaled identically.
        self.xrange = parent.xrange
        self.yrange = parent.yrange
        self.classes_no = parent.classes_no
        self.max_class = parent.max_class
        self.ids = ids

    def __getitem__(self, i):
        """Return the profile of the i-th member of the subset."""
        profile_id = self.ids[i]
        return self.profileSet.profile[profile_id]

    def __len__(self):
        """Number of profiles in the subset."""
        return len(self.ids)

    def len(self):
        """Number of profiles in the subset (method form of len())."""
        return len(self.ids)

    def Info(self, i):
        """Return profileSet.Info() for the i-th member of the subset."""
        profile_id = self.ids[i]
        return self.profileSet.Info(profile_id)

    def getclass(self, i):
        """Return the class stored in the superset for the i-th member."""
        profile_id = self.ids[i]
        return self.profileSet.seq_classes[profile_id]

    def setclass(self, i, classe):
        """Store classe in the superset as the class of the i-th member."""
        profile_id = self.ids[i]
        self.profileSet.seq_classes[profile_id] = classe

    def SortByLikelihood(self, profileClustering, model_id):
        """Return (ids, values) of the subset, ordered by SortOnItem.

        The sort key of a profile id i is profileClustering.P[i, model_id];
        the second element of the result holds the keys in the same order.
        """
        # XXX How to get the likelihoods for the subset. Recompute??
        # Use GHMM subset thing? Currently sort by P[model|profile]
        keys = [profileClustering.P[pid, model_id] for pid in self.ids]
        order = SortOnItem(range(len(self.ids)), keys)
        ordered_ids = [self.ids[pos] for pos in order]
        ordered_keys = [keys[pos] for pos in order]
        return (ordered_ids, ordered_keys)

    def getSequenceSet(self):
        """Return the subset of profileSet.ghmm_seqs selected by self.ids."""
        return self.profileSet.ghmm_seqs.getSubset(self.ids)


class ProfileClustering:
    """Computational context: Contains a set of models, a set of clusters
       and a cluster assignment, if a clustering has been computed prior
       and no changes to the models --- such as delete or add model ---
       have invalidated it.

       It is responsible for model IO. Caller have to supply ProfileSet
    """
    def __init__(self):
        self.type = 'HMM' # type of the model being estimated 'HMM', 'Gaussian', ...
        self.cluster = None # A list where self.cluster[i] is the cluster assigned to i
        self.clusters = None # A list of partition (list of integers)
        self.profileSet = None # The gene expression profile data
        self.models = {} # A dictionary of models
        self.ids = [] # Mapping cluster id (an int) to model ids (whatever)
        self.views = [] # Views on our data (and models).
        self.fixed_models = [] # List of ids of models which will not be trained
        self.modelsFileName = None

        self.defaultAlpha = 1.0
        self.cutoff=0.0
        self.eps = 0.01
        self.alpha = []  # list of alphas (priors) to be used by each model ...
                         #it is only used in the case the user want to fix a prior
	self.entropies = [] # entropies of a sequence given the model assignemt probability
        self.listeners = []

        #parameters for heterogeneous data extensions
        self.mixtureModel = None #object of holding a mixture model from the mixture package, when needed
        #XXX - we should also have data from regulatory regions here soon ...
        
        #parameters for partial labels learning   
        self.partial_label = None
        self.partial_info = None
        self.partials = None

        #parameters of constrainted EM learning
        self.constraints = [None,None] # NxN matrices with positives [0] or negative [1] constraints
        self.prior = 0.0
        self.prior_neg = 0.0
        
        #parameters of constrainted EM learning
        #self.P = Numeric.zeros((len(data), len(models)), Numeric.Float)

    def getNoModels(self):
        if self.type == 'HMM':
          return len(self.models)
        elif self.type == 'Gaussian':
          return self.mixtureModel.G
        else:
          return self.no_clusters
	        
    def setProfileSet(self, profileSet):
        """Install profileSet as the data to be clustered.

        Stores the profile set together with the result of its
        GHMMProfileSet() call, calls invalidateClustering(), takes over the
        missingRate of the profile set and pushes it into the models.
        """
        self.profileSet = profileSet
        self.sequenceSet = profileSet.GHMMProfileSet()
        self.invalidateClustering()
        self.missingRate = self.profileSet.missingRate
        # the models' emissions depend on the missing data rate of the data
        self.updateModelsMissingDataEmission()

    def updateModelsMissingDataEmission(self):
        """Write self.missingRate into the emissions of all HMMs.

        For every state except the last one, the third emission parameter
        of component 0 is set to 1 - missingRate and that of component 1
        to missingRate. Only done for type 'HMM'; not yet implemented for
        mixture models. The model list is printed first.
        """
        hmms = self.modelList()
        print(hmms)
        # XXX
        if self.type != 'HMM':
            return
        for hmm in hmms:
            # do not include the last state
            for state in range(hmm.N - 1):
                if isinstance(hmm, ghmm.ContinuousMixtureHMM):
                    # emissions carry a leading type entry, which is kept
                    (kind, mu, sigma, weight) = hmm.getEmission(state, 0)
                    hmm.setEmission(state, 0, kind,
                                    (mu, sigma, 1 - self.missingRate, 0))
                    (kind, mu, sigma, weight) = hmm.getEmission(state, 1)
                    hmm.setEmission(state, 1, kind,
                                    (mu, sigma, self.missingRate, 0))
                elif isinstance(hmm, ghmm.GaussianMixtureHMM):
                    (mu, sigma, weight) = hmm.getEmission(state, 0)
                    hmm.setEmission(state, 0,
                                    (mu, sigma, 1 - self.missingRate))
                    (mu, sigma, weight) = hmm.getEmission(state, 1)
                    hmm.setEmission(state, 1,
                                    (mu, sigma, self.missingRate))

    def filterProfiles(self, f, param):
        """Keep only the profiles p for which f(p, param) is true.

        Can be used as a hook for filtering after reading. Both the profile
        set and the sequence set are replaced by the subset of kept ids;
        afterwards the missing data rate is recomputed and pushed into the
        models. If no data set is loaded, a warning dialog is shown and
        nothing else happens.

        f     -- predicate called as f(profile, param)
        param -- second argument handed to f unchanged
        """
        if self.profileSet is None:
            # showwarning is not among the imports at the top of this
            # module; import it here so this branch cannot raise NameError.
            from tkMessageBox import showwarning
            showwarning("Message","You need to load the data set first")
            return

        keep_ids = [i for i, p in enumerate(self.profileSet) if f(p, param)]
        # profile set and sequence set are restricted to the same ids
        self.profileSet = self.profileSet.getSubset(keep_ids)
        self.sequenceSet = self.sequenceSet.getSubset(keep_ids)
        print(len(self.sequenceSet))
        # the missing data rate changes with the data; recompute it
        self.profileSet.missingDataRate()
        self.missingRate = self.profileSet.missingRate
        self.updateModelsMissingDataEmission()


    def randomModels(self,a,cyclic=0,noise=0.0):
        """Replace the current models by a freshly generated HMM collection.

        For every state count in a.number_of_states, a.number_of_models
        models are added through addModel(). With a.noiseModel == 1 one
        extra single-state model is added with fix=1; it gets the prior
        0.05 and is recorded in self.fixed_models, the remaining prior mass
        is spread evenly. Without noise model all priors are equal.
        Finally the mixture object is rebuilt from the HMMs.

        a      -- parameter object (number_of_states, total_duration,
                  number_of_models, parameters, default_mean,
                  default_variance, dimension, noiseModel)
        cyclic -- handed to addModel()
        noise  -- handed to addModel()
        """
        self.type='HMM'
        self.modelsFileName = "<None>"
        del(self.models)
        self.models = {}
        self.fixed_models = []
        self.invalidateClustering()
        self.alpha = []
        self.defaultAlpha = 1.0

        # Bound before the loop: the noise model below needs it even when
        # the list of state counts is empty.
        d = float(a.total_duration)
        # NOTE(review): eval() executes whatever a.number_of_states holds;
        # it must never be fed untrusted input.
        for n in eval(a.number_of_states):
            for i in range(a.number_of_models):
                self.addModel(n, d, a.parameters,
                              mu=a.default_mean, sigma=a.default_variance,cyclic=cyclic,dimension=a.dimension,noise=noise)
        if a.noiseModel == 1:
            self.addModel(1, d, a.parameters, mu=a.default_mean, sigma=a.default_variance,fix=1)
            self.defaultAlpha = (1.0-0.05)/(len(self.alpha)-1)
            self.alpha[:] = [self.defaultAlpha]*len(self.alpha)
            self.alpha[-1] = 0.05
            # The noise model is the last model added. a.number_of_models is
            # only its id when a single state count was given, so derive the
            # id from the number of models actually present.
            noise_id = self.getNoModels() - 1
            if noise_id not in self.fixed_models:
                self.fixed_models.append(noise_id)
        else:
            self.defaultAlpha = 1.0/len(self.alpha)
            self.alpha[:] = [self.defaultAlpha]*len(self.alpha)

        #initializing mixture object
        self.setMixtureFromHMMs()

    def setMixtureFromHMMs(self):
        """Internal method to initialize self.mixtureModel from the HMMs.

        Every model self.models[i], i = 0 .. getNoModels()-1, is wrapped
        into a one-element ProductDistribution. The entry of the fix list
        is 2 for ids listed in self.fixed_models and 0 otherwise. The
        current self.alpha is printed and used as mixture weights.
        """
        count = self.getNoModels()
        components = [mixture.ProductDistribution([mixtureHMM.HMM(self.models[k], 20)])
                      for k in range(count)]
        fix = Numeric.zeros(count, Numeric.Float)
        for k in range(count):
            if k in self.fixed_models:
                fix[k] = 2
        print(self.alpha)
        self.mixtureModel = mixture.MixtureModel(count, self.alpha, components,
                                                 fix.tolist(), identifiable = 0)
        
    def randomGaussianModels(self,a):
        """Replace the current models by a mixture of Gaussian components.

        Builds a.number_of_models components, each a ProductDistribution of
        one NormalDistribution per entry of the first profile. With
        a.noiseModel == 1 one more component of the same shape is added and
        its id is recorded in self.fixed_models. All priors are equal.

        a -- parameter object (number_of_models, default_mean,
             default_variance, noiseModel)
        """
        print(a)
        self.type='Gaussian'
        self.modelsFileName = "<None>"
        del(self.models)
        self.models = {}
        self.fixed_models = []
        self.invalidateClustering()
        self.alpha = []
        self.defaultAlpha = 1.0

        noclusters = a.number_of_models
        print(noclusters)

        d = len(self.profileSet[0])
        mean = float(a.default_mean)
        # NOTE(review): a.default_variance is handed over as the second
        # NormalDistribution parameter -- confirm that a variance (and not
        # a standard deviation) is expected there.
        sigma = float(a.default_variance)

        def new_component():
            # Every component gets its own list of d distributions, so no
            # list is shared between two components.
            return mixture.ProductDistribution(
                [mixture.NormalDistribution(mean, sigma) for i in range(d)])

        models = [new_component() for n in range(noclusters)]

        if a.noiseModel == 1:
            # The noise component used to extend the list of the last
            # cluster, which gave it 2*d dimensions and altered that list.
            noclusters = noclusters + 1
            models.append(new_component())
            self.fixed_models.append(noclusters-1)
        # NOTE(review): unlike setMixtureFromHMMs(), the fix entry of the
        # noise component stays 0 here -- confirm whether that is intended.
        fix = Numeric.zeros(noclusters,Numeric.Float)

        self.defaultAlpha = 1.0/noclusters
        self.alpha[:] = [self.defaultAlpha]*noclusters
        self.mixtureModel = mixture.MixtureModel(noclusters,self.alpha,models,fix.tolist(),identifiable = 1)
        #print self.mixtureModel
          
    def setModels(self, models):
        """models is a list of HMMs (internal)"""
        del(self.models)
        self.type='HMM'
        self.models = {}
        for i,m in enumerate(models):
            self.models[i] = m
        self.fixed_models = []
        self.invalidateClustering()
        self.modelsFileName = "<None>"
        noclusters = i + 1
        self.defaultAlpha = 1.0/noclusters
        self.alpha[:] = [self.defaultAlpha]*noclusters
        self.setMixtureFromHMMs()
        

    def readModels(self, fileName, fileType, append = 0):
        """Read several models from a file and install them.

        fileName -- file to read
        fileType -- 'xml' reads with ghmm.HMMOpenXML, anything else with
                    ghmm.HMMOpenSMO
        append   -- if true, the models are added to the current ones
                    instead of replacing them

        The priors stored in the models become self.alpha; they are
        normalized if they do not sum to one. Afterwards the mixture object
        is rebuilt and computeClustering('MixtureExt') is run with
        max_iter = 0 to obtain self.P. Note that self.max_iter and self.eps
        are overwritten. Nothing happens if the file contains no models.
        """
        if fileType == 'xml':
            models = ghmm.HMMOpenXML(fileName)
        else:
            models = ghmm.HMMOpenSMO(fileName)
        if models == []:
            return
        #XXX - for now, only HMMs are read
        self.type='HMM'
        newModels = []
        for m in models:
            # NOTE(review): with 'or' a model is only kept as it is when it
            # is an instance of BOTH classes; confirm that 'and' was not
            # intended.
            if ((not isinstance(m,ghmm.GaussianMixtureHMM)) or
                (not isinstance(m,ghmm.ContinuousMixtureHMM))):
                # XXX - hack ... use query to build a model that support missing data
                query = GQLQuery()
                query.loadQuery(m)
                query.newSHMM()
                newModels.append(query.shmm)
            else:
                newModels.append(m)

        newAlpha = [m.getPrior() for m in newModels]

        if append:
            # setModels() resets self.alpha to one entry per model (old and
            # new); the previous priors have to be saved first, otherwise
            # the new priors get appended to an already complete list.
            oldAlpha = list(self.alpha)
            self.setModels(self.modelList()+newModels)
            self.alpha = oldAlpha + newAlpha
        else:
            self.setModels(newModels)
            self.alpha = newAlpha

        self.updateModelsMissingDataEmission()

        self.modelsFileName = fileName

        #making sure the alphas sum to 1 (and give a warning otherwise)
        total = sum(self.alpha)
        # tolerate rounding errors of the floating point sum
        if abs(total - 1.0) > 1e-9:
            print('Warning: alphas do not sun to one. They will be normalized!',self.alpha);
            if total == 0.0:
                self.defaultAlpha = 1.0/len(self.modelList())
                self.alpha = [self.defaultAlpha]*len(self.modelList())
            else:
                # keep self.alpha a plain list, as everywhere else
                self.alpha = [float(x) / total for x in self.alpha]

        self.setMixtureFromHMMs()

        # include self.P computation
        self.max_iter = 0
        self.eps = 0.01
        self.computeClustering('MixtureExt')

    def writeModels(self, fileName,fileType):
        """Write the current HMMs to fileName and remember the file name.

        Nothing is written unless the type is 'HMM' and models exist.
        NOTE(review): fileType is only printed; the list is always written
        with ghmm.GHMM_FILETYPE_XML -- confirm this is intended.
        """
        print(fileType)
        if self.type != 'HMM' or self.models == {}:
            return
        hmms = self.modelList()
        ghmm.HMMwriteList(fileName, hmms, fileType=ghmm.GHMM_FILETYPE_XML)
        self.modelsFileName = fileName

    def modelList(self):
        """Returns a list of models. Assume fixed order from now on. The model
           dict self.models is used to allow adding/deleting models freely.

           As a quick hack we supply modelList to allow list level access to
           all the models.

           XXX DEPRECIATED, replace
        """
        self.ids = []
        result = []
        for key in self.models.keys():
            self.ids.append(key)
            result.append(self.models[key])
        return result

    def model(self, id):
        """Return the model stored in self.models under the key id."""
        return self.models[id]

    def readyToRun(self,method=''):
        if method=='Kmeans' or method=='Hierarchical' or self.type == 'Hier' or self.type== 'Kmeans':
          return (self.profileSet != None) 
        else:
          return (self.models != {} or self.mixtureModel != None) and (self.profileSet != None)
        #return True

    def dataLoaded(self):
        return (self.profileSet != None)

    def modelLoaded(self):
        return (self.models != {})

    def partialLabelsLoaded(self):
        return (self.partial_label != None)

    def alreadyRunned(self):
        return (self.cluster != None)


    def ReadPartialAssignment(self, fileName, nrModels=0):
        """
        Read labels for genes of known function. The class_label
        below corresponds to the model number (0,...)

        fileName should have the following format

        gene_id\tclass_label[\tcomment]

        XXX ToDo lines starting with '#' are ignored

        """

        print 'test', nrModels
        if nrModels == 0:
          nrModels = self.getNoModels()
        self.partial_label = {}
        self.partial_info = {}
        self.partials = [None] * nrModels

        file = open(fileName,'r')
        lines = file.readlines()
        file.close()


        for l in lines:
            l = string.strip(l)
            if (not l):
                continue
            else: # (l[0] not in string.digits):
	        items = l.split('\t')
            	(gene, class_label) = (items[0], int(items[1]))
	        if ( gene in self.profileSet.genename):
		    for i in range(len(self.profileSet.genename)):
		   	if(self.profileSet.genename[i] == gene):
			    gene_id = i
			    break
		else:
		    print "not in", gene
#	    else:
#            	items = l.split('\t')
#            	(gene_id, class_label) = (int(items[0])-1, int(items[1])-1)

            # XXX int(items[0])-1 because is later used as array index

#            if class_label > nrModels:
#                print nrModels, class_label            
#                print "ReadPartialAssignment: Error: The number of models should be equal to the number of classes labels!"
#                #"class_label %d is larger than maximal model id" % (class_label, nrModels - 1)
#                return

            
            self.partial_label[gene_id] = class_label
            if self.partials[class_label] == None:
                self.partials[class_label] = [gene_id]
            else:
                self.partials[class_label].append(gene_id)
            if len(items) == 3:
                self.partial_info[gene_id] = items[2]
            #print self.partials

    def ReadConstraints(self, fileName,neg=0):
        """Read a constraints file with GQLValidation.readMixture().

        The second element of the result is stored in
        self.constraints[neg]; the first element (the labels) is read but
        not used.
        Still need to do checking (same genes, ...)
        """
        # XXX - to do - check if labels corresponds ...
        (unused_labels, matrix) = GQLValidation.readMixture(fileName)
        self.constraints[neg] = matrix
        

    def _clusteringData(self, dataSetClass):
        """Wrap the current data into a data set object of the mixture package.

        For type 'HMM' a mixtureHMM.SequenceDataSet is filled from
        self.sequenceSet; otherwise dataSetClass() is filled from the
        profiles of self.profileSet.
        """
        if self.type == 'HMM':
            data = mixtureHMM.SequenceDataSet()
            data.fromGHMM([],[self.sequenceSet])
        else:
            data = dataSetClass()
            if self.mixtureModel is None:
                print('no models was defined')
            data.fromList(self.profileSet.profile)
        return data

    def computeClustering(self, method=None, nocluster=0, progressBar=None):
        """Cluster the loaded data with the selected method.

        method      -- 'MixtureExt', 'MixtureConstrained',
                       'PartiallySupervisedMixture', 'Clustering',
                       'Hierarchical' or 'Kmeans'
        nocluster   -- accepted, but not used
        progressBar -- only handed to estimate_clustering() for method
                       'Clustering'

        On success self.ml, self.P, self.cluster, self.clusters, self.alpha
        and the entropies are updated. self.max_iter and self.eps (and
        self.no_clusters / self.no_repetitions for 'Hierarchical' and
        'Kmeans') have to be set beforehand, e.g. through
        clusteringParameters(). If no clustering can be computed -- unknown
        method, or partial labels missing -- a message is printed and the
        method returns right after invalidateClustering().
        """
        self.invalidateClustering()
        models = self.modelList()

        if method == 'MixtureExt':
            data = self._clusteringData(mixture.DataSet)
            print("%s %s" % (data.N, data.p))
            (self.ml, self.alpha, self.P) = estimate_mixture_ext(self.mixtureModel, data,
                                                             self.max_iter, self.eps)
            self.cluster = self.mixtureModel.classify(data)
            print(self.cluster)
            print("cluster assignment by decode mixture")

        elif method == 'MixtureConstrained':
            # which of the two constraint priors are active
            prior_type = 0
            if self.prior_neg > 0.0 and self.prior > 0.0:
                prior_type = 3
            elif self.prior_neg > 0.0:
                prior_type = 2
            elif self.prior > 0.0:
                prior_type = 1

            mm = mixture.CMMfromMM(self.mixtureModel)
            self.mixtureModel = mm
            data = self._clusteringData(mixture.ConstrainedDataSet)
            data.setPairwiseConstraints(self.constraints[0],self.constraints[1])
            self.P = Numeric.zeros((len(data),  self.getNoModels()), Numeric.Float)
            [log_l,log_p] = mm.EM(data,self.max_iter,self.eps, self.prior,self.prior_neg,Numeric.transpose(self.P),prior_type,normaliziering=False)
            self.ml = log_p
            self.alpha = mm.pi.tolist()

            # NOTE: Numeric.exp gives underflow warnings for very small
            # arguments. Values not above -4.8e2 are set to -Inf manually,
            # their exponential is practically zero anyway.
            self.P=Numeric.transpose(Numeric.exp(Numeric.where(log_l > -4.8e2, log_l, minus_infinity)))

            self.cluster = self.mixtureModel.classify(data,self.prior,self.prior_neg,Numeric.transpose(self.P),prior_type)
            print(self.cluster)

        elif method == 'PartiallySupervisedMixture':
            if self.partial_label is None:
                # Nothing can be estimated without labels; the code below
                # would fail on the missing cluster assignment.
                print("You need to load the partial labels first")
                return
            mm = mixture.LMMfromMM(self.mixtureModel)
            self.mixtureModel = mm
            data = self._clusteringData(mixture.ConstrainedDataSet)
            data.setConstrainedLabels(self.partials)

            [log_l,log_p] = mm.EM(data,self.max_iter,self.eps)
            self.ml = log_p
            self.alpha = mm.pi.tolist()

            # see the NOTE on underflow in the 'MixtureConstrained' branch
            self.P=Numeric.transpose(Numeric.exp(Numeric.where(log_l > -4.8e2, log_l, minus_infinity)))
            self.cluster = self.mixtureModel.classify(data)

        elif method == 'Clustering':
            (self.ml, self.cluster, self.P) = estimate_clustering(models, self.sequenceSet,
                                                                  self.max_iter, self.eps,
                                                                  self.fixed_models,
                                                                  progress=progressBar)
        elif method == 'Hierarchical':
            import PyclusterInterface
            self.type = 'Hier'
            self.alpha = None
            cluster = PyclusterInterface.hierarchical(self.profileSet,self.no_clusters,dist='e',method='a')
            self.ml = 0.0
            self.cluster = cluster
            self.P =  GQLValidation.mixtureFromPartition(cluster,self.no_clusters)
        elif method == 'Kmeans':
            import PyclusterInterface
            self.type = 'Kmeans'
            self.alpha = None
            (cluster,error,n) = PyclusterInterface.kmeans(self.profileSet,self.no_clusters,npass=self.no_repetitions,dist='e')
            self.ml = error
            self.cluster =  cluster
            self.P =  GQLValidation.mixtureFromPartition(cluster,self.no_clusters)
        else:
            print("ProfileClustering::computeClustering method '%s' not implemented yet" % method)
            return

        # turn the assignment into one list of sequence indices per cluster
        self.clusters = [[] for i in range(self.getNoModels())]
        for i, c in enumerate(self.cluster):
            self.clusters[c].append(i)

        self.calculateEntropies()

        if self.alpha is None:
            # We dont have alpha, we can use the proportion of sequences
            # assigned to each cluster
            self.alpha = []
            for i in range(self.getNoModels()):
                self.alpha.append(float(len(self.clusters[i])) / len(self.sequenceSet))
            if self.type == 'HMM':
                # setting the right alphas
                for i,m in enumerate(models):
                    m.setPrior(self.alpha[i])

    def computeClusteringView(self, method=None, progressBar=None):
        self.computeClustering(method, progressBar)
        for view in self.views:
            view.new()


    def calculateEntropies(self):
        """Compute Entropy(p) for every row p of self.P.

        Sets self.entropies (in row order), self.maxEntropy and
        self.sortedEntropies (the same values in ascending order).
        """
        self.entropies = [Entropy(row) for row in self.P]
        self.maxEntropy = max(self.entropies)
        # sort (entropy, row index) pairs, then drop the index again
        ranked = list(zip(self.entropies, range(len(self.P))))
        ranked.sort()
        self.sortedEntropies = [pair[0] for pair in ranked]
 

    def clusteringParameters(self, method=None):
        """Return a list of attribute names of ProfileClustering which control
           the clustering method. These attributes have to be set to objects
           of types defined in EditObjectAttributesDialog.py (i.e., ValidatingInt
           instead of int).

           This allows the return value of clusteringParameters to be used as the
           pars argument for the EditObjectAttributesDialog which edits the attributes
           in place.

           E.g., in the calling context
           dummy = EditObjectAttributesDialog(self, self.profileClustering,
                                              self.profileClustering.clusteringParameters())

           As a side effect the attributes are reset to their defaults:
           max_iter and eps always; prior and prior_neg for
           'MixtureConstrained' (1.0 if the matching constraints are
           loaded, 0 otherwise); no_clusters for 'Hierarchical';
           no_clusters and no_repetitions for 'Kmeans'.
        """
        self.max_iter = ValidatingInt(20)
        self.eps = ValidatingFloat(0.01)
        self.desc = "Mixture estimation parameters"
        self.use_partial_labels = 1 # XXX make an option
        if method == 'MixtureConstrained':
            # Identity tests: the constraints are matrices, and comparing a
            # matrix with None may be evaluated element by element.
            if self.constraints[0] is None:
                self.prior = ValidatingFloat(0)
            else:
                self.prior = ValidatingFloat(1.0)
            if self.constraints[1] is None:
                self.prior_neg = ValidatingFloat(0)
            else:
                self.prior_neg = ValidatingFloat(1.0)
            return ['max_iter','eps','prior','prior_neg']
        elif method == 'Hierarchical':
            self.no_clusters = ValidatingInt(5)
            return ['no_clusters']
        elif method == 'Kmeans':
            self.no_clusters = ValidatingInt(5)
            self.no_repetitions = ValidatingInt(15)
            return ['no_clusters','no_repetitions']
        else:
            return ['max_iter','eps']
      

    
    def estimateFromSequences(self, a):
        """Estimate a collection of models from the data using
        pseudo-Bayesian techniques described in MethodPaper and
        Olof Perssons work. Uses module InitialModels.py.
        XXX missing data handling not yet implemented

        a -- parameter object (number_of_states, Method, Synchronicity,
             number_of_clusters)

        The resulting models replace the current ones with equal priors;
        self.max_iter and self.eps are overwritten and
        computeClustering('MixtureExt') is run to obtain self.P.
        """
        shrunken = []
        for idx, seq in enumerate(self.sequenceSet):
            linear = LinearModel(seq)
            shrunken.append(linear)
            linear.fastShrink(a.number_of_states)
            if idx % 200 == 0:
                print("%d models shrunken..." % (idx))

        collection = InitialCollection(shrunken, a.Method, a.Synchronicity,
                                       a.number_of_clusters)
        collection.hierarchicalCluster(2.0)

        converted = []
        for hmm in collection.toGHMMs():
            # NOTE(review): a model is only kept as it is when it is an
            # instance of BOTH classes -- confirm this is intended.
            keep_as_is = (isinstance(hmm, ghmm.GaussianMixtureHMM) and
                          isinstance(hmm, ghmm.ContinuousMixtureHMM))
            if keep_as_is:
                converted.append(hmm)
            else:
                # hack ... use query to build a model that support missing data
                query = GQLQuery()
                query.loadQuery(hmm)
                query.newSHMM()
                converted.append(query.shmm)
        self.setModels(converted)
        self.updateModelsMissingDataEmission()

        model_count = len(self.modelList())
        self.defaultAlpha = 1.0 / model_count
        self.alpha[:] = [self.defaultAlpha] * len(self.modelList())

        # include self.P computation
        self.max_iter = 0
        self.eps = 0.01
        self.computeClustering('MixtureExt')

    def estimateFromRandomWeightsExternal(self, zero_one_weights = 0):
        """Initialize the mixture model object from the given data.

        Calls modelInitialization() of self.mixtureModel with
        rtype=zero_one_weights and stores the transposed result in self.P.
        """
        if self.type != 'HMM':
            data = mixture.DataSet()
            if self.mixtureModel == None:
                print('no models was defined')
            data.fromList(self.profileSet.profile)
        else:
            data = mixtureHMM.SequenceDataSet()
            #XXX - this need to be extended for heterogeneous data
            data.fromGHMM([],[self.sequenceSet])
        #calling function from mixture package
        print('vai')
        weights = self.mixtureModel.modelInitialization(
            data, rtype=zero_one_weights, missing_value = MISSING_DATA)
        print('foi')
        self.P = Numeric.transpose(weights)
        
    def estimateFromRandomWeights(self, zero_one_weights = 0, assignment = None):
        """Perform BaumWelch-estimation for each model in ProfileClustering
           based on randomly chosen weights w_{sm} for each sequence s and
           model m. Weights are chosen uniform in [0,1]

           Assuming that the model collection consists of a number of copies of the
           same model, this allows to arrive at distinct models before running the
           mixture.

           If zero_one_weights is true, then for a fixed s w_{sm} = 1 for
           exactly one model m, which is chosen uniformly among the
           models that are not fixed.

           If assignment is set (one model index per sequence), no
           randomization is done; it is only used with zero_one_weights.

           Models listed in self.fixed_models are not reestimated. The
           weight matrix (sequences x models) is stored in self.P and
           returned.
        """
        noSeqs = len(self.sequenceSet)
        w = Numeric.zeros((noSeqs,self.getNoModels()),Numeric.Float)
        models = self.modelList()
        if zero_one_weights:
            # Filter instead of list.remove(): duplicate or out-of-range
            # entries of fixed_models must not raise ValueError.
            trainable_models = [j for j in range(self.getNoModels())
                                if j not in self.fixed_models]

            print("estimateFromRandomWeights: trainable_models =  %s" % (trainable_models,))

            if assignment is None or len(assignment) == 0: # perform random assignment
                assignment = [random.choice(trainable_models) for i in range(noSeqs)]

            for j, m in enumerate(models):
                # XXX Hack: a fixed_model might have sequences assigned
                if j in self.fixed_models:
                    continue

                for i in range(noSeqs):
                    if assignment[i] == j:
                        self.sequenceSet.setWeight(i,1.0)
                        w[i,j] = 1.0
                    else:
                        self.sequenceSet.setWeight(i,0.0)

                print(" Reestimating model %s" % (j,))
                m.baumWelch(self.sequenceSet, 10, 0.5)
        else:
            for j, m in enumerate(models):
                # XXX Hack: a fixed_model might have sequences assigned
                if j in self.fixed_models:
                    continue
                for i in range(noSeqs):
                    aux  = random.uniform(0,1)
                    w[i,j] = aux
                    self.sequenceSet.setWeight(i,aux)
                print(" Reestimating model %s" % (j,))
                m.baumWelch(self.sequenceSet, 10, 0.5)
        self.P = w
        return w

    def estimateFromPartialAssignment(self):
        models = self.modelList()
        for j, m in enumerate(models):
            if self.partials[j] == None:
                continue
            print " Reestimating model", j, self.partials[j]
            for i in xrange(len(self.sequenceSet)):
                if i in  self.partials[j]:
                    self.sequenceSet.setWeight(i,1.0)
                else:
                    self.sequenceSet.setWeight(i,0.0)
            #steps = min(10, 2 + len(self.partials[j]))
            m.baumWelch(self.sequenceSet, 10, 0.01)

##    def pathGrouping(self, paths, states):
##	""" computes a list of lists containing the indices
##	corresponding to the grouping resulting from a sorting
##	of the paths with respect to the passed states"""
##	groupedIndices = []
##	allinfo = []
##	for i,path in enumerate(paths):

##		seqinfo = []
##		for x in states:
##			seqstateinfo = []
##			for k in range(len(path)):
##				if path[k] == x:
##					seqstateinfo.append(k)
##			seqinfo.append(seqstateinfo)

##		seqinfo.append(i)

##		allinfo.append(seqinfo)

##	allinfo.sort()

##	#    print allinfo

##	oldinfo = []
##	for x in allinfo:

##		#print oldinfo, x[:-1], x[-1]
##		if x[:-1] != oldinfo:
##			#print "Empty list appended."
##			groupedIndices.append([])
##		oldinfo = x[:-1]

##		groupedIndices[-1].append(x[-1])

##	return groupedIndices


    def data(self):
        """Return the profile set held in self.profileSet."""
        profiles = self.profileSet
        return profiles

    def nrClusters(self):
        """Return the number of clusters.

           Returns 0 while no clustering is available (self.clusters is None,
           e.g. after invalidateClustering) instead of failing in len().
        """
        if self.clusters is None:
            return 0
        return len(self.clusters)

    def __getitem__(self,i):
        """Allow iteration over clusters!"""
        if self.cluster == None:
            raise IndexError
        else:
            return self.clusters[i]

    def normalizeAlphas(self):
        """Scale the entries of self.alpha in place so that they sum to 1.

           Nothing is changed when the entries sum to 0.
        """
        # float() avoids integer division when the alphas are stored as ints
        norm = float(sum(self.alpha))
        if norm != 0:
            for i in range(self.getNoModels()):
                self.alpha[i] = self.alpha[i] / norm

    def invalidateClustering(self):
        """Call when we change models, profiles, ...

           Drops the current clustering (cluster and clusters become None),
           calls clear() on every registered view and resets defaultAlpha,
           entropies and listeners.
        """
        self.cluster = self.clusters = None
        for registered_view in self.views:
            registered_view.clear()
        self.defaultAlpha = 0
        # entropies of a sequence given the model assignment probability
        self.entropies = []
        # NOTE(review): this also drops every listener added via addListener
        # -- confirm that this is intended.
        self.listeners = []

    def addView(self, view):
        """Register a view with this object.

           A view has to understand: new(), update() and clear()
             new:    a new clustering, with a new number of clusters
             update: cluster membership might have changed
             clear:  the clustering displayed became invalid
        """
        registered = self.views
        registered.append(view)

    def removeView(self, view):
        """Unregister a view previously added with addView."""
        registered = self.views
        registered.remove(view)

    def modelIDs(self):
        """Return the ids of all models, i.e. the keys of self.models."""
        model_table = self.models
        return model_table.keys()

    def nextModelID(self):
        """Needed when we add/delete models.

           Returns 0 if there are no models yet, otherwise the largest
           id in use plus one.
        """
        if self.models == {}:
            return 0
        highest_id = max(self.models.keys())
        return highest_id + 1

    def addModel(self, nr_states, d, randomize = 0, mu = 0.0, sigma = 0.1, cyclic = 0, fix=0, dimension=1,noise=0.0):
        """Add a linear model of nr_states states with P[selfloop] = 0.1, Gaussian
           emission N(0.0, 0.2) and an extra state at the end producing the end
           symbol (encoded as 9999.99).

           Use random values if randomize = 1
        """
        print "noise", noise
        if self.type != 'HMM':
            print 'You should not add a HMM to another model type'
        else:
            if cyclic:
                d = d/2
            average_d = float(d) / nr_states
            p = 1.0 - 1.0/ average_d

            #
            self.invalidateClustering()
            id = self.nextModelID()
            N = nr_states+1 # Correct for 'end' state
            # Build the transition matrix
            A = Numeric.zeros((N,N),Numeric.Float)
            for i in range(N - 1):
                if randomize:
                    d = max(1.01, random.gauss(average_d, 1.0))
                    A[i,i] = 1.0 - 1.0/ d
                else:
                    A[i,i] = p
                A[i,i+1] = 1.0 - A[i,i]
            A[N-1,N-1] = 1.0

            if cyclic:
                A[N-2,0] =  A[N-2,N-1]/2
                #A[N-2,N-2]= A[N-2,N-2]/2
                A[N-2,N-1] = A[N-2,N-1]/2

            if dimension > 1: # use multivariate hmms                
              #B = Numeric.zeros((N,3,2), Numeric.Float)
              B = [] 
              for i in range(N - 1):
                Baux = []
#                if randomize:
#                    [mu + random.uniform(-2,2) for j in range(dimension)]
#                    Baux = [ [mu + random.uniform(-2,2) for j in range(dimension)],  
#                               Numeric.reshape(Numeric.diag([sigma * random.uniform(.5,3) for j in 
#range(dimension)]),(1,dimension*dimension)).tolist()[0] ,
#                             [-9999.99 for j in range(dimension)],
#                              Numeric.reshape(Numeric.diag([0.01 for j in 
#range(dimension)]),(1,dimension*dimension)).tolist()[0] ,
#                             [1-self.missingRate,self.missingRate]]
#                else:
#                     Baux = [ [mu for j in range(dimension)],
#                             Numeric.reshape(Numeric.diag([sigma for j in 
#range(dimension)]),(1,dimension*dimension)).tolist()[0] ,
#                             [-9999.99 for j in range(dimension)],
#                             Numeric.reshape(Numeric.diag([0.01 for j in 
#range(dimension)]),(1,dimension*dimension)).tolist()[0] ,
#                             [1-self.missingRate,self.missingRate]]
                if not isinstance(mu, list):   
                    mu1 = [mu for j in range(dimension)]
                else:
                    mu1 = mu
                if randomize:
                    mu1 = [m + random.uniform(-2,2) for m in mu1]
                    sig1 = Numeric.reshape(Numeric.diag([sigma * random.uniform(.5,3) for j in range(dimension)]),(1,dimension*dimension)).tolist()[0]
                    mu2 = [-9999.99 for j in range(dimension)]
                    sig2 =  Numeric.reshape(Numeric.diag([0.01 for j in range(dimension)]),(1,dimension*dimension)).tolist()[0]
                else:
                    sig1 =  Numeric.reshape(Numeric.diag([sigma for j in range(dimension)]),(1,dimension*dimension)).tolist()[0]
                    mu2 =   [-9999.99 for j in range(dimension)]
                    sig2 =  Numeric.reshape(Numeric.diag([0.01 for j in range(dimension)]),(1,dimension*dimension)).tolist()[0]
                                        
                if noise > 0:
                    print "addicionou o noise"
                    mu3 =   mu1
                    sig3 =  Numeric.reshape(Numeric.diag([5.0 for j in range(dimension)]),(1,dimension*dimension)).tolist()[0]                  
                    Baux = [mu1,sig1,mu2,sig2,mu3,sig3,
                             [1-noise-self.missingRate,self.missingRate,noise]]
                else:
                    Baux = [mu1,sig1,mu2,sig2,[1-self.missingRate,self.missingRate]]
                #print Baux
                B.append(Baux)

              Baux = ([   [9999.99 for j in range(dimension)],
                           Numeric.reshape(Numeric.diag([0.01 for j in range(dimension)]),(1,dimension*dimension)).tolist()[0] ,
                         [-9999.99 for j in range(dimension)],
                           Numeric.reshape(Numeric.diag([0.01 for j in range(dimension)]),(1,dimension*dimension)).tolist()[0] ,
                           [1,0]])

            
              #Baux = [ [9999.99], [-9999.99], [1,0] ]
              B.append(Baux)
              A[N-2,N-1] = 0.0
              A[N-2,N-2] = 1.0

              #Baux.append([9999.99,-9999.99] # END symbol, only produced in last state
              #B[N-1,1] = [0.01,0.0001]
              #B[N-1,2] = [1,0]

              
            else:
              if noise > 0:                  
                B = Numeric.zeros((N,3,3), Numeric.Float)
                for i in range(N - 1):
                  if randomize:
                    B[i][0] = [mu + random.uniform(-2,2),-9999.99,mu] #-9999.99 is the missing data value
                    B[i][1] = [sigma * random.uniform(.5,3),0.01,5.0]
                    B[i][2] = [1-self.missingRate-noise,self.missingRate,noise]
                  else:
                    print mu
                    B[i,0] = [mu,-9999.99,mu]
                    B[i,1] = [sigma,0.0001,5.0]
                    B[i][2] = [1-self.missingRate-noise,self.missingRate,noise]

                B[N-1,0] = [9999.99,-9999.99, 0] # END symbol, only produced in last state
                B[N-1,1] = [0.01,0.0001, 1]
                B[N-1,2] = [1,0,0]
              else:
                B = Numeric.zeros((N,3,2), Numeric.Float)
                for i in range(N - 1):
                  if randomize:
                    B[i][0] = [mu + random.uniform(-2,2),-9999.99] #-9999.99 is the missing data value
                    B[i][1] = [sigma * random.uniform(.5,3),0.01]
                    B[i][2] = [1-self.missingRate,self.missingRate]
                  else:
                    B[i,0] = [mu,-9999.99]
                    B[i,1] = [sigma,0.0001]
                    B[i][2] = [1-self.missingRate,self.missingRate]

                B[N-1,0] = [9999.99,-9999.99] # END symbol, only produced in last state
                B[N-1,1] = [0.01,0.0001]
                B[N-1,2] = [1,0]
              
            pi = Numeric.zeros(N, Numeric.Float)
            pi[0] = 1.0

            if dimension > 1:
              hmm = ghmm.HMMFromMatrices(ghmm.Float(),ghmm.MultivariateGaussianDistribution(ghmm.Float), A, B, pi)
            else:
              hmm = ghmm.HMMFromMatrices(ghmm.Float(),ghmm.GaussianMixtureDistribution(ghmm.Float), A, B, pi)
            # XXX Fix 'end' state to prevent re-estimation in Baum-Welch
            # hmm.fix(N-1,hmm.kFixTransitions && kFixEmissions)

            # fix the missing symbol emissions
            for i in range(N):
               if noise > 0:
                 hmm.setMixtureFix(i,[0,1,1])
               else:
                 hmm.setMixtureFix(i,[0,1])
            # fixing also the terminating state from estimation
            hmm.setStateFix(N-1,1)
            #print "is fixed?", N-1, hmm.getStateFix(N-1)
            if fix ==1:
              for i in range(N):
                hmm.setStateFix(i,1)

            self.alpha.append(self.defaultAlpha)
            self.models[id] = hmm
            print "# Added %d-state model with id=%s" % (nr_states, id)
        return id

    def deleteModel(self, id):
      if self.type != 'HMM':
        print 'You should not add a HMM to another model type'
      else:       
        if id in self.fixed_models:
            self.fixed_models.remove(id)
        del(self.models[id])
	del self.alpha[id]
        self.invalidateClustering()

    def fixModel(self,id):
        """A fixed model is excluded from re-estimation in the clustering.

           Adds id to self.fixed_models unless it is already listed.
        """
        if id in self.fixed_models:
            return
        self.fixed_models.append(id)

    def unfixModel(self, id):
        """Remove id from self.fixed_models; do nothing if it is not listed."""
        if id not in self.fixed_models:
            return
        self.fixed_models.remove(id)

    def modelIsFixed(self,id):
        """Return whether id is listed in self.fixed_models."""
        is_fixed = id in self.fixed_models
        return is_fixed

    def writeSeqMixDistributions(self, fileName):
        if self.cluster == None:
            print "# ProfileClustering.SeqMixDistribution: Don't have a clustering yet"
            return
        file = open(fileName, 'w')
        #file.write("# Clustering of %d seqs from %s\n" % (len(self.profileSet),
        #                                                  self.profileSet.fileName))

        #file.write("# Using %d models from %s\n" % (self.getNoModels(),
        #                                            self.modelsFileName))
        file.write("# id\tacc\tCluster\tEntropy\tP[cluster0|seq]...P[clusterN|seq]\n")

        for i in xrange(len(self.profileSet)):
            file.write("%s\t%s\t" %(self.profileSet.genename[i], self.profileSet.acc[i]))
                                            #(self.cluster[i]+1),self.entropies[i]))
            for j in range(len(self.clusters)):
                if j < len(self.clusters) - 1:
                    file.write("%1.4f\t" % (self.P[i,j]))
                else:
                    file.write("%1.4f\n" % (self.P[i,j]))
        file.close()


    def readMixtureDistributions(self, fileName):
        file = open(fileName,'r')
        lines = file.readlines()
        file.close()

        ids = []
        mix = []

        for l in lines[1:]:
            l = string.strip(l,'\r\n')
            items = l.split('\t')
            print items
            ids.append(l[0])
            aux = []
            
            for i in items[2:]:
                aux.append(float(i))

            mix.append(aux)

        #assert len(mix) == len(self.profileSet)
        self.P = mix
        self.no_clusters = len(mix[0])


    def writeClusterAssignment(self, fileName, allPosteriors=None):
        if self.cluster == None:
            print "# ProfileClustering.writeClusterAssignment: Don't have a clustering yet"
            return
        file = open(fileName, 'w')
        file.write("# Clustering of %d seqs from %s\n" % (len(self.profileSet),
                                                          self.profileSet.fileName))
        file.write("# Using %d models from %s\n" % (self.getNoModels(),
                                                    self.modelsFileName))
        file.write("# id\tacc\tcluster\tP[cluster|seq]\n")
        for i, c in enumerate(self.clusters):
            file.write("# Cluster %d of size %d\n" % (i,len(c)))
            for j in c:
                # XXX hack for tonight: write genename instead of current number
		if allPosteriors is not None:
			posteriors = "\t".join(["%1.4f" % self.P[j,k] for k in xrange(len(self.clusters))])
                	file.write("%s\t%s\t%d\t%s\n" % (self.profileSet.genename[j], self.profileSet.acc[j],
                                                	    i, posteriors))	
		else:
                	file.write("%s\t%s\t%d\t%1.4f\n" % (self.profileSet.genename[j], self.profileSet.acc[j],
                                                	    i, self.P[j,i]))
        file.close()

    def writeClusteredDataCAGED(self, fileName, entropy=0, labels=[], dim=1, variate=[]):        
    	''' A method for writing a CAGED file, with the clustering assingment in the second column'''

        if (labels == []):
          labels = self.cluster
          if self.cluster == None:
              print "# ProfileClustering.writeClusterAssignment: Don't have a clustering yet"
              labels = self.profileSet.info
              print labels
            
        file = open(fileName, 'w')

        if entropy:
            (genes,hide) = self.EntropyCutoff(self.cutoff)
        else:
            genes =  range(len(self.profileSet))
        
        size = len(self.profileSet.profile[0])

        if len(variate) > 0:
          indices = []
          for v in variate:
              indices = indices+range(v,size,dim)
          indices.sort()
        else:
          indices = range(size)

        print 'indices',indices

        for i in genes:
            file.write("%s\t%s" % (self.profileSet.genename[i].strip('"'), labels[i]))
	    for j in indices:
	    	file.write("\t%2.4f" % self.profileSet.profile[i][j])
            file.write("\n")
        file.close()

    def setEntropyCutoff(self,event,cutoff):
        """Store cutoff, apply it and pass the result to the listeners.

           cutoff is stored unchanged in self.cutoff and converted with
           float() for EntropyCutoff; event is not used. Returns the pair
           (showlist, hidelist) that was handed to updateResult.
        """
        self.cutoff = cutoff
        shown, hidden = self.EntropyCutoff(float(cutoff))
        self.updateResult(shown, hidden)
        return shown, hidden

    def EntropyCutoff(self,cutoff):
        """Split the indices of self.entropies at cutoff.

           Returns (showlist, hidelist): the indices whose entropy is below
           cutoff, and all remaining indices.
        """
        shown, hidden = [], []
        for index, value in enumerate(self.entropies):
            bucket = hidden
            if value < cutoff:
                bucket = shown
            bucket.append(index)
        return shown, hidden
    
    def getClustersAfterCuttoff(self,cutoff):
        """Return a copy of self.clusters restricted to the shown profiles.

           The shown profiles are the first list returned by
           EntropyCutoff(float(cutoff)). Clusters keep their position;
           a cluster without shown members becomes an empty list.
        """
        shown, hidden = self.EntropyCutoff(float(cutoff))
        return [[member for member in group if member in shown]
                for group in self.clusters]


    def updateResult(self, showProfiles, hideProfiles):
        """Call update(showProfiles, hideProfiles) on every listener."""
        for listener in self.listeners:
            listener.update(showProfiles, hideProfiles)

    def addListener(self, listener):
        """Register listener; updateResult calls its update() method."""
        registered = self.listeners
        registered.append(listener)

    def removeListener(self, listener):
        """Unregister a listener previously added with addListener."""
        registered = self.listeners
        registered.remove(listener)


def modelAsString(m):
    """This assumes linear models: Produces a two-line string.

       The first line lists 'mu +/- sigma' of the first emission component
       for every state except the last one, the second line the duration
       1 / (1 - a) computed from the self-transition probability a of these
       states.
    """
    emission_cells = []
    durations = []
    for state in range(m.N-1):
        if isinstance(m,ghmm.ContinuousMixtureHMM):
            (t, mu, sigma, weight) = m.getEmission(state,0)
        elif isinstance(m,ghmm.GaussianMixtureHMM):
            (mu, sigma, weight) = m.getEmission(state,0)
        else:
            (mu, sigma) = m.getEmission(state)
        emission_cells.append("%1.2f +/- %1.2f\t" % (mu, sigma))
        # a = 1.0 implies infinite duration, hence the cap
        self_loop = min(0.999, m.getTransition(state, state))
        durations.append(1.0 / (1.0 - self_loop))
    emission_line = "".join(emission_cells)
    duration_line = '\t'.join(["%2.2f\t" % value for value in durations])
    return emission_line + "\n" + duration_line + "\n"


if __name__ == '__main__':
    # Script entry point: start the GQLClusterApp GUI when this file is
    # executed directly. The GUI modules are only imported in this case.
    import Tkinter
    import GQLClusterApp
    random.seed() # Timeseed Python RNG
    # Create the Tk root window and build the application inside it
    root = Tkinter.Tk()
    GQLClusterApp.GQLClusterApp(root)
    # Hand control over to the Tk event loop
    root.mainloop()


