#!/usr/bin/env python2.3

usage_info="""
Generate a sequence set from given models.

Usage: estimation_with_replications.py [options] <input_data_file.txt>
Options:
    -h          Print this help message.
    -c number   Number of Components (default 1)
    -e C|M      Type of estimation (default M)
                C  - clustering
		CM - constrained mixture
                LM - labeled mixture
		M  - mixture (default)
		K  - k-means
    -E          Calculate cluster validation statistics from data labels
                If labels are given in the second column of the dataset
    -i R|K|M|L  Initialization method
                R - random initialization (default)
		K - k-means initialization
		L1 - with labels (equal sizes)
                L2 - with labels (sampled)
                L3 - labels from file
		M <model_file.smo> - given model initialization
    -I          Identifier for repetitions (default 0)
    -l number   Percentage of labels to be use (number/1000)
                Only used in the case that original labels are used
    -D number   Dimension of multivariates
    -L file     Label file
                if not is included, all  labels from the original data set are used
    -m file     Model file (only with conjuntion with -i M)
    -n          Output statistics for number finding the number of components (AIC,BIC,IBIC)
    -p float    Prior for positive constraints learning (should be used with -t)
    -P float    Prior for negative constraints learning (should be used with -T)    
    -r number   Number of Repetitions (default 30)
    -s number   Number of states in the random models (default 2)
                If states is equal to data lengh, mixture of gaussians are used.
    -S          Seed for random initializations (default 0)
    -o file     Initial name scheme for all output file
    -t file     Positive Constraints file
    -T file     Negative Constraints file
    -V number   Peform cross validation (number/100) (not fully implemented)


Example:  python estimation_with_replications.py -c 2 -s 2 -i L3 -e LM -r 1 -D 70 -o output -L const/baranzini-fold-all.txt-cv-2-1.lab -M 0  baranzini-fold-all.txt
          python estimation_with_replications.py -no -c 3 -s 4 -e CM -i R -r 8 -P 5000 -D 70 -o res-cons-1-2 -T const/baranzini-fold-all.txt-cv-1-2-neg.const -M 20 -L const/baranzini-fold-all.txt-cv-1-2.lab baranzini-fold-all.txt

          

Notes:
 * Requires the ghmm and mixture.py modules 
 * If the sequence file already exists, the new sequence set is appended.
"""

import GQLMixture
import GQLCluster
import GQLValidation
import ViterbiDecomposition
import numpy
import getopt
import feature_selection
from random import *
from GQLEvaluation import *
#import matplotlib.mlab as MLab
import ghmm
from GO.GODag import *
from PPI.PPI import *
import copy
import mixture
import mixtureHMM


def constraintDistribution(cluster,constraints,no):
  resw = numpy.zeros(no,numpy.float)
  resb = numpy.zeros((no,no),numpy.float)
  resb2 =  numpy.zeros((no,2),numpy.float)
  count = numpy.zeros(no,numpy.float)
  count2 =  numpy.zeros(no,numpy.float)
  size = len(cluster)

  #print cluster
  #print constraints
  
  for i in range(size):
    if sum(constraints[i]) > 0.0:
      count[cluster[i]] += 1
      
    for j in range(size):
        if i != j:
	  resb[cluster[i]][cluster[j]] += constraints[i][j]
          if cluster[i] != cluster[j]:
            resb2[cluster[i]][1] += constraints[i][j]
          else:
            resb2[cluster[i]][0] += constraints[i][j]
        else:
          resw[cluster[i]] += 1
          

  sumc = numpy.sum(resb,axis=1)
  suml = numpy.sum(resb,axis=0) + 0.00001
  aux = [list(resw)]
  aux.append(list(size-resw))
  resw2 = numpy.array(aux,numpy.float)
  aux = resb*sumc/(size*size)
  neg_resw = numpy.transpose([numpy.diagonal(aux),numpy.sum(aux-numpy.eye(no,no)*numpy.diagonal(aux),axis=0)])


  return resb,numpy.transpose(resb/suml),(numpy.transpose(numpy.transpose(resb)/resw)/resw)/sum(sum(numpy.transpose(numpy.transpose(resb)/resw)/resw)), resb2, numpy.transpose(numpy.transpose(resb2)/suml), numpy.transpose((numpy.transpose(resb2)/(resw2))), neg_resw


def joinDouble(mat,separator):
  res = str(mat[0])
  for i in mat[1:]:
    res = res + separator + str(i)
  return res

def printTable(table):
        s = ""
	saux = ""
	for j in range(len(contigencyTable[0])):
		s += "%4d" % (j+1)
		saux+="----"
	s += " \n"+saux+"\n"

	for i in range(len(table)):
		s += "%4d |" % (i+1)
		for j in range(len(contigencyTable[i])):
			s += "%4d" % contigencyTable[i,j]
		s +="\n"
	return s

class Usage(Exception):
    def __init__(self, msg):
        self.msg = msg

class random_models_args:
	def __init__(self):
		self.number_of_states = nr_states
		self.number_of_models = num_clusters
		self.total_duration = duration
		self.parameters = 1
		self.default_variance = 3.0
		self.default_mean = 5.0
		self.noiseModel = 0
                self.dimension = 10
                self.parameters=0
		


def set_random_partial_labels_equal(profileClustering,percentageOfLabels,labelsClasses):
   profileClustering.partials = []
   profileClustering.partial_label = {}

   no_classes = len(labelsClasses)
   no_labels = len(profileClustering.profileSet)*percentageOfLabels
   k = int(no_labels/no_classes)
   if k == 0:
       k = 1

   print no_classes, no_labels, percentageOfLabels,k

   for i,c in enumerate(labelsClasses):
       selected = sample(range(max(k,len(c))),k)
       profileClustering.partials.append([])
       for s in selected:
           profileClustering.partials[i].append(c[s])

       for s in selected:
           profileClustering.partial_label[c[s]] = i
   print "labels", profileClustering.partials

def set_random_partial_labels(profileClustering,percentageOfLabels,labelsClasses):
   profileClustering.partials = []
   profileClustering.partial_label = {}

   for i,c in enumerate(labelsClasses):
       print len(c),percentageOfLabels
       k = int(len(c)*percentageOfLabels)
       if k == 0:
            k = 1
       print c,k
       selected = sample(range(len(c)),k)
       profileClustering.partials.append([])
       for s in selected:
           profileClustering.partials[i].append(c[s])

       for s in selected:
           profileClustering.partial_label[c[s]] = i
   print "labels", profileClustering.partials


def replications(profileSet, model,repetitions, initializationMethod,estimationMethod,inputmodel,perlabels,labelFile,
		 constr,neg_constraints,prior_weight,prior_neg_weight,prior_type,maxIter):
        currentProfile = None
	profiles = []

        print 'replications'

	if(initializationMethod in ('L1','L2')):
	    labels = profileSet.seq_classes
    	    labelsClasses = []
            for i in range(max(labels)):
                labelsClasses.append([])
            for i,l in enumerate(labels):
                labelsClasses[l-1].append(i)

        count = repetitions
        j = 0
	#maxError = 20
	#countError = 0
        while j < count:
  	    try:
		alpha = []


		currentProfile = GQLCluster.ProfileClustering()
		currentProfile.setProfileSet(profileSet)

                print 'exemplos', len(profileSet)

                currentProfile.max_iter=1


                print 'before initialization'

                if estimationMethod in ('LM','C','CM','M'):
                  if(initializationMethod == 'R'):			
                      currentProfile.randomModels(model,noise=0.05)
                      #w = currentProfile.estimateFromRandomWeightsExternal()
                      currentProfile.estimateFromRandomWeights()
                  elif(initializationMethod == 'K'):
                       # estimate from kmeans results
                      currentProfile.randomModels(model)
                  elif(initializationMethod in ('L1','L2','L3')):
                      print 'init', initializationMethod
                      currentProfile.randomModels(model,noise=0.05)
                      print currentProfile.models
                      if(initializationMethod == 'L1'):
                          set_random_partial_labels(currentProfile,perlabels,labelsClasses)
                      elif(initializationMethod == 'L2'):
                          set_random_partial_labels_equal(currentProfile,perlabels,labelsClasses)
                      elif(initializationMethod == 'L3'):
                          currentProfile.ReadPartialAssignment(labelFile)
                      currentProfile.estimateFromPartialAssignment() # estimate from random assigments
                  else:
                      currentProfile.readModels(inputmodel,'xml')

                print 'after'

                currentProfile.max_iter=maxIter
                currentProfile.eps=0.1
               

                if(estimationMethod == 'LM'):  
		    currentProfile.computeClustering('PartiallySupervisedMixture')
		elif(estimationMethod == 'M'):
		    currentProfile.computeClustering('MixtureExt')		
		elif(estimationMethod == 'C'):
		    [currentProfile.ml,
		     currentProfile.cluster,
		      currentProfile.p] = GQLMixture.estimate_clustering(currentProfile.modelList(),
		      							profileSet.ghmm_seqs,
                                                                        10, 0.01,currentProfile.fixed_models)
		elif(estimationMethod == 'CM'):
		    currentProfile.prior =  prior_weight
		    currentProfile.prior_neg =  prior_neg_weight
		    currentProfile.constraints = [constraints,neg_constraints]
		    currentProfile.computeClustering('MixtureConstrained')
		    
		elif(estimationMethod == 'K'):
                    currentProfile.no_clusters=model.number_of_models
                    currentProfile.no_repetitions=1
                    currentProfile.computeClustering('Kmeans')
 		elif(estimationMethod == 'H'):
                    currentProfile.no_clusters=model.number_of_models
                    currentProfile.no_repetitions=1
                    currentProfile.computeClustering('Hierarchical')
                    print currentProfile.P
                    
##		elif(estimationMethod == 'M'):
##  		    [currentProfile.ml,
##		     currentProfile.alpha,
##		     currentProfile.p] =    GQLMixture.estimate_mixture(currentProfile.modelList(),

##			    					        profileSet.ghmm_seqs, 10, 0.01, 					 			                        	currentProfile.fixed_models)
##	            currentProfile.cluster = GQLMixture.decode_mixture(currentProfile.p, num_clusters)
##                elif(estimationMethod == 'CMO'):		

##                    if prior_type == 1:
##    		      [currentProfile.ml,
##		       currentProfile.alpha,
##		       currentProfile.p] =    GQLMixture.estimate_mixture(currentProfile.modelList(),
##			      					        profileSet.ghmm_seqs, 10, 0.01, 					 			                        	currentProfile.fixed_models,
##									previous_weights=w,
##									constraints=constr,prior_weight=prior_weight,
##									prior_type=prior_type)
##                    if prior_type == 2:
##    		      [currentProfile.ml,
##		       currentProfile.alpha,
##		       currentProfile.p] =    GQLMixture.estimate_mixture(currentProfile.modelList(),
##			      					        profileSet.ghmm_seqs, 10, 0.01, 					 			                        	currentProfile.fixed_models,
##									previous_weights=w,
##									neg_constraints=neg_constraints,
##									prior_neg_weight=prior_neg_weight,
##									prior_type=prior_type)
##                    if prior_type == 3:
##    		      [currentProfile.ml,
##		       currentProfile.alpha,
##		       currentProfile.p] =    GQLMixture.estimate_mixture(currentProfile.modelList(),
##			      					        profileSet.ghmm_seqs, 10, 0.01, 					 			                        	currentProfile.fixed_models,
##									previous_weights=w,
##									constraints=constr,
##									prior_weight=prior_weight,
##									neg_constraints=neg_constraints,									                        prior_neg_weight=prior_neg_weight,		    
##									prior_type=prior_type)
##                      currentProfile.cluster = GQLMixture.decode_mixture(currentProfile.p, num_clusters)		      
##		if(estimationMethod == 'LMO'):
##  		    [currentProfile.ml,
##		     currentProfile.alpha,
##		     currentProfile.p] =    GQLMixture.estimate_mixture_partials(currentProfile.modelList(),
##			    					                 profileSet.ghmm_seqs,
##                                                                                 10, 0.01,currentProfile.fixed_models,
##									         currentProfile.partial_label)
##	            currentProfile.cluster = GQLMixture.decode_mixture(currentProfile.p, num_clusters)

                currentProfile.clusters = []
                for i in xrange(num_clusters):
                   currentProfile.clusters.append([])
                for i, c in enumerate(currentProfile.cluster):
                    currentProfile.clusters[c].append(i)
		profiles.append(currentProfile)
	        j += 1
            except OverflowError:
	        print "repetition failed %i"%j
		print OverflowError
            except mixture.InvalidPosteriorDistribution:
	        print "repetition failed %i"%j
                j += 1
            except mixture.ConvergenceFailureEM:
	        print "repetition failed %i"%j
                j += 1             
	return (profiles)

def resultsEvaluation(clusteringProfile,viterbiGroups, filter=[]):

        #do viterbi
        if( viterbiGroups != 0):
	    cluster = clusteringProfile.cluster
            no_clusters = 0
            for j in range(num_clusters):
              i = 0
	      cluster_aux = GQLCluster.ProfileSubSet(clusteringProfile.profileSet,clusteringProfile.clusters[j])
  	      vitDecomp = ViterbiDecomposition.ViterbiDecomposition(cluster_aux.getSequenceSet(),
								    clusteringProfile.modelList()[j])
              grouping = vitDecomp.computeFixedNumberDecomposition(viterbiGroups)
  	      i = 0

  	      for group in grouping:
                  for element in group:
                      cluster[cluster_aux.ids[element]] = no_clusters + i
                  i += 1

	      no_clusters =+i

	    clusteringProfile.cluster = cluster
            
        #print filter

        if len(filter) == 0:
          filter = range(len(clusteringProfile.profileSet))

        #print 'filter', filter
        (contigencyTableAll, rand, spe,
         sen ,cr, error,pp,np)= GQLValidation.computeExternalIndices(clusteringProfile.profileSet.seq_classes,
								     clusteringProfile.cluster,
	                                                             clusteringProfile.profileSet.classes_no,
                                                                     max(clusteringProfile.cluster)+1)
        print contigencyTableAll
        
        print 'here', clusteringProfile.cluster, clusteringProfile.profileSet.seq_classes
        classes = numpy.take(clusteringProfile.profileSet.seq_classes,filter)
        cluster = numpy.take(clusteringProfile.cluster,filter)

        print 'here', cluster, classes
	(contigencyTable, rand, spe,
         sen ,cr, error,pp,np)= GQLValidation.computeExternalIndices(classes,
								     cluster,
	                                                             clusteringProfile.profileSet.classes_no,
                                                                     max(clusteringProfile.cluster)+1)
	print contigencyTable        
	return (rand, spe, sen, contigencyTable, error, pp, np,contigencyTableAll)


def loadEvalData(data,profile,level,ont):	
    if( data == 'GO' or data == 'GOSlim' or data=='GOAll'):
        go = GODag()
        go.loadTermsFromFile('../../GO/gene_ontology.obo')
        go.loadGenesFromFile('../../GO/gene_association.sgd')
        if( data == 'GOSlim' ):
            go = go.getGOSlim('goslim_yeast')
	
	if( data == 'GO' or data == 'GOSlim'):
	  (mixtureAno,mask,labelnames) = mixtureFromGoMapping(go,profile.genename,level,ont)
	elif (data == 'GOAll'):
	  (mixtureAno,mask,labelnames) = mixtureFromGoMapping(go,profile.genename,level,ont,all=1,max=300,min=2)    
    elif (data == 'PPI'):
        ppi = PPIGraph()
	ppi.loadPPIFromFile('../../PPI/identifications.txt')
	(mixtureAno,mask, labelnames) = mixtureFromPPI(ppi.baitTargets,profile.genename)
    elif (data == 'RRR'):
        ppi = PPIGraph()
	ppi.loadPPIFromTabFile('../../PPI/regulee-tor.txt')
	(mixtureAno,mask, labelnames) = mixtureFromPPI(ppi.baitTargets,profile.genename)
    return mixtureAno, mask 	



if __name__ == '__main__':

        import sys

	# default settings
        noOfClusters = 1
        fileName = ""	
        nr_states = "[17]"
        viterbiGroups = 0
        repetitions = 30
        initializationMethod = 'R'
        estimationMethod = 'M'	
        inputModel = ""
	labels = 0
	noOfClusters = 0
	sd=0	
	outputFile = ""
	constraintsFile = ""
	prior_type=0
	prior_weight = 0.0
	prior_neg_weight = 0.0
	crossvalidate = 0.0
	negconstraintsFile = ''
	id = 0
	data = ''
	eval = 1
        labelFile = ''
        maxIter=10
        dimension=1
        onlyLabels=1
        filterGenes=0

        argv = sys.argv
        print argv	

        try:
            opts, args = getopt.getopt(argv[1:], 'h:c:e:i:I:l:L:m:M:O:o:r:S:v:n:s:p:P:t:T:V:d:P:D:')
        except getopt.error, msg:
	     print usage_info
             raise Usage(msg)

        # process options
        for o, a in opts:
            if o in ('-h'):
                print __doc__
                exit()
            if o in ('-c'): 
                # Number of components
                num_clusters = int(a)
	    if o in ('-e'):
    	        estimationMethod = a    
            if o in ('-E'):
                # Perform evaluation              
		eval = int(a)		
            if o in ('-r'):
                # Nunber of repetitions
                repetitions = int(a)
            if o in ('-M'):
                # Model file as input
                maxIter = int(a)               
            if o in ('-m'):
                # Model file as input
                inputModel = a
            if o in ('-o'):
                # File name for outputs
                outputFile = a
            if o in ('-O'):
                # File name for outputs
                onlyLabels = int(a)
	    if o in ('-t'):
		#positive constraints file
		print 't'
		constraintsFile = a
	    if o in ('-T'):
	        print '-T'
	        # negative constraints file
		negconstraintsFile = a
	    if o in ('-n'):
		# print statitics for end evaluation
		noOfClusters = 1		    
	    if o in ('-p'):
		# prior weight
		prior_weight = float(a)/1000
	    if o in ('-P'):
		# prior weight
		prior_neg_weight = float(a)/1000
            if o in ('-s'):
                # Number of states
                int(a)
		nr_states = "["+a+"]"
            if o in ('-S'):
                # Seed              
		sd = int(a)		
	    if o in ('-V'):	        
		crossvalidate = float(a)/100
	    if o in ('-I'):	        
		id = float(a)
	    if o in ('-i'):	        
		initializationMethod = a		
	    if o in ('-d'):
		data = a
            if o in ('-L'):
                labelFile = a
            if o in ('-D'):
		dimension = int(a)
		
	fileName = argv[-1]
        if fileName == "":
            raise Usage('No input file specified')

        if(initializationMethod == 'M'):
            repetitions = 1
	    if (inputModel == ""):
    	        raise Usage('option -i M requires the definition of a initial model')

        if(initializationMethod == 'L3'):
            repetitions = 1

	if(initializationMethod in ('L1','L2') ):
	    labels = labels/100
	    if (labels == 0):
    	        raise Usage('option -i L requires the definition of the percentage of labels')
		
        if outputFile == "":
	   outputFile = fileName
	   
	neg_constraints = None
	constraints = None

	if negconstraintsFile != '' and constraintsFile != '':
	  prior_type =3
	elif constraintsFile != '':
	  prior_type =1
	elif negconstraintsFile != '':
	  prior_type =2
	  if prior_neg_weight == 0.0:
	    prior_neg_weight = prior_weight
	    prior_weight = 0
	 
	if prior_type == 1 or prior_type == 3:
	    [ids,constraints] = GQLValidation.readMixture(constraintsFile)	
	if prior_type == 2 or prior_type == 3:
	    [ids,neg_constraints] = GQLValidation.readMixture(negconstraintsFile)
	if prior_type == 3:
	    sump = numpy.sum(numpy.sum(constraints))
	    sumn = numpy.sum(numpy.sum(neg_constraints))
	    if prior_neg_weight == 0.0:
 	      prior_neg_weight = (prior_weight*sump)/(2*(sumn+sump))
	      prior_weight = (prior_weight*sumn)/(sumn+sump)

	print 'priors', prior_type, prior_neg_weight, prior_weight, negconstraintsFile, constraintsFile
	    
	if sd != 0:
	  seed(sd)

        profileSet = GQLCluster.ProfileSet()
	if "sqd" in fileName:
	    sequenceSet = ghmm.SequenceSet(ghmm.Float(),fileName)
            profileSet.ReadDataFromDSequences(sequenceSet,fileName)
	else:
            profileSet.ReadDataFromCaged(fileName)

        if onlyLabels == 1:
           auxProfile = GQLCluster.ProfileClustering()
           auxProfile.setProfileSet(profileSet)
           auxProfile.ReadPartialAssignment(labelFile,nrModels=num_clusters)
           labbeled = []
           test = []
           print 'partials', auxProfile.partials
           for c in auxProfile.partials:
             if c != None:
               labbeled = labbeled+c
             labbeled.sort()
           for i in range(len(profileSet)):
               if i not in labbeled:
                 test.append(i)
           print labbeled, test
        
           profileOrig = profileSet
	   profileSet = profileOrig.getSubset(labbeled)
	   profileTest = profileOrig.getSubset(test)
        else:
          labbeled = range(len(profileSet))
          

	if crossvalidate > 0.0:
	   size = len(profileSet)
	   testSet = sample(range(size),int(size*crossvalidate))
           testSet.sort()
	   trainSet = []
	   includedGenes = numpy.zeros(size)
	   for i in range(size):
	     if i not in testSet:
	       trainSet.append(i)
	       #includedGenes[i] = 1
           print testSet
  
	   if constraints != None:
	     orig_constraints = copy.deepcopy(constraints)
	     constraints = numpy.take(constraints,trainSet,axis=0)
	     constraints = numpy.take(constraints,trainSet,axis=1)
	     test_constraints =  numpy.take(orig_constraints,testSet,axis=0)
	     test_constraints =  numpy.take(test_constraints,testSet,axis=1)    
	   if  neg_constraints != None:
	     orig_neg_constraints = copy.deepcopy(neg_constraints)	   
	     neg_constraints = numpy.take(neg_constraints,trainSet,axis=0)
	     neg_constraints = numpy.take(neg_constraints,trainSet,axis=1)
	     test_neg_constraints =  numpy.take(orig_neg_constraints,testSet,axis=0)
	     test_neg_constraints =  numpy.take(test_neg_constraints,testSet,axis=1)    	     

	   profileOrig = profileSet   
	   profileSet = profileOrig.getSubset(trainSet)
	   profileTest = profileOrig.getSubset(testSet)
	   
	   file = open(fileName+'-'+str(id)+'test.txt','w')
	   for i in testSet:
		file.write(str(i)+'\n')
	   file.close()

	   #print 'depois do corte', len(profileOrig)
	   #print len(profileSet)
	   #print len(testSet)
	   if constraints != None:
 	     print len(constraints)
	     print len(test_constraints)
	   	   
	    
        duration = len(profileSet[0])
        random = random_models_args()
	random.nr_states = nr_states
        random.dimension = dimension

        # setting initial mean to data mean
        #means = []
        #prof = numpy.array(profileSet.profile)
        #for i in range(dimension):
          #indices = range(i,duration,dimension)
          #values = numpy.take(prof,indices)
          #means.append(numpy.mean(numpy.mean(values)))
         

        random.default_mean = profileSet.meanValues(dimension=dimension)

	profiles = replications(profileSet,random,repetitions,initializationMethod,estimationMethod,
				inputModel,labels, labelFile, constraints,neg_constraints,
                                prior_weight,prior_neg_weight,prior_type,maxIter)

	# evalutation of results


        outfile = open("ctt-"+outputFile+"-%i-%s.txt"%(num_clusters,nr_states),'w')
        
        rands = []
        sens = []
        spes = []
        pps = []
        nps = []
        errors = []
	aics = []
	bics = []
	ibics = []
	lls = []
	diags = []
	diagsn = []
        conts = []

        for i,p in enumerate(profiles):
             lls.append(p.ml)
             
        for i,p in enumerate([profiles[numpy.argmax(lls)]]):		
            lls.append(p.ml) 
	    if eval:
              filter = []              
              if onlyLabels==1:
                filter=test
                #print 'test', test
                p.setProfileSet(profileOrig)
                p.max_iter=0
                if(estimationMethod == 'CM'):
		    p.prior =  0
		    p.prior_neg =  0
                    constraints_aux=  numpy.zeros((len(profileOrig),len(profileOrig)),numpy.float)
                    neg_constraints_aux=  numpy.zeros((len(profileOrig),len(profileOrig)),numpy.float)
		    p.constraints = [constraints_aux,neg_constraints_aux]
		    p.computeClustering('MixtureConstrained')
                else:
                  p.computeClustering('MixtureExt')
                  #data = mixtureHMM.SequenceDataSet()          
                  #data.fromGHMM([],[profileOrig.GHMMProfileSet()])
                  #cluster = p.mixtureModel.classify(data)
                
              
                
              (rand, spe, sen, contigencyTable,error,pp,np,ctAll) = resultsEvaluation(p,viterbiGroups,filter=filter)
              rands.append(rand)
              sens.append(sen)
              spes.append(spe)
              errors.append(error)
              pps.append(pp)
              nps.append(np)
              outfile.write(str(contigencyTable.tolist())+'\n')
              outfile.write(str(ctAll.tolist()))
              
	    else:
	      rand = 0
	      sen = 0
	      spe = 0
              error = 0
	      rands.append(0)
	      sens.append(0)
	      sens.append(0)
              errors.append(0)
              pps.append(0)
              nps.append(0)	      	    

    	    if(noOfClusters):
                size = len(profileSet)
	        aic = GQLValidation.computeAIC(p.ml,p)
	    	bic = GQLValidation.computeBIC(p.ml,p, size)
		ibic = GQLValidation.computeICL(p.ml,p, size)
	        #outfile.write("\t%f\t%f\t%f"%(aic,bic,ibic))
		aics.append(aic)
		bics.append(bic)
		ibics.append(ibic)

            if(filterGenes):
              outFileAux = fileName[:-4]+'-genes.txt'
              file = open(outFileAux,'r')
              lines = file.readlines()
              print outFileAux
              l = lines[0].strip('\n')              
              geneFiles = l.split('\t')              
              profileAux = GQLCluster.ProfileSet()
              uniSeqsList = []
              genes = []
              for gf in geneFiles:
                profileAux.ReadDataFromCaged('data/'+gf, end=0)
                aux = profileAux.getSubset(labbeled)
                uniSeqsList.append(aux.ghmm_seqs)
                genes.append(gf[len(fileName[:-10]):-4])
              values = feature_selection.feature_selection(p.modelList(),p.alpha,uniSeqsList,dimension,profileSet.seq_classes)
              values = [(s,genes[i],i) for i,s in enumerate(values)]
              values.sort()            
              print 'filter', values
              v = []
              for (i,j,l) in values:
                v.append(l)
              print genes, v
              

	    #outfile.write("%i\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\n"%(repetitions,rand, spe, sen, error, pp, np, p.ml,diag,diagn))

        outfile.close()
		    

        bestProfile = profiles[numpy.argmax(lls)]
        bestModels = bestProfile.modelList()
        outFile = outputFile+'-'+str(num_clusters)+'-'+str(nr_states)
        bestProfile.writeModels(outFile+'.xml','HMM')
        bestProfile.writeSeqMixDistributions(outFile+'.spd')

        if filterGenes:
          fileFilter = open(outFile+'.gen','w')
          for (v,g,i) in values:
            fileFilter.write(g+'\t'+str(i)+'\t'+str(v)+'\n')
          fileFilter.close()
        
        outfile = open("res-"+outputFile+"-%i-%s.txt"%(num_clusters,nr_states),'w')

        if(initializationMethod in ('L1','L2','L') ):
	    if( prior_type != 0):
 	        outfile.write("%f\t%f\t"%(prior_weight,prior_neg_weight))

		
 	    if eval:
	        outfile.write("%f\t%f\t%f\t%f\t%f\t%f\t%f\t"%(labels,
                                                              numpy.mean(errors),
                                                              numpy.mean(rands),
							      numpy.mean(spes),
							      numpy.mean(sens),
							      numpy.mean(pps),
                                                              numpy.mean(nps)))
	    if data != '':
                outfile.write("%f\t%f\t"%(cr,crt))
		
	    if( noOfClusters):
 	        outfile.write("%f\t%f\t%f\t%f\t%f\t%f\t%f\t"%(labels,numpy.mean(aics),
							      numpy.mean(aics),numpy.mean(bics),
							      numpy.mean(bics),numpy.mean(ibics),
							      numpy.mean(ibics)
							      ))

	    outfile.write(" ".join(argv)+"\n")	    
	else:
	    if( prior_type != 0):
 	        outfile.write("%f\t%f\t"%(prior_weight,prior_neg_weight))
	    if eval:
   	        outfile.write("%f\t%f\t%f\t%f\t%f\t%f\t%f\t"%(labels,
                                                              numpy.mean(errors),           
                                                              numpy.mean(rands),
							     numpy.mean(spes),
							     numpy.mean(sens),
							      numpy.mean(pps),
                                                              numpy.mean(nps)))            
	    if data != '':
                outfile.write("%f\t%f\t"%(cr,crt))		    
		    
	    if( noOfClusters):
 	        outfile.write("%f\t%f\t%f\t%f\t%f\t%f\t"%(numpy.mean(aics),
							      numpy.mean(aics),numpy.mean(bics),
							      numpy.mean(bics),numpy.mean(ibics),
							      numpy.mean(ibics)))

	    
	    outfile.write("\t".join(argv)+"\n")


	outfile.close()











