#!/usr/bin/env python2.3
################################################################################
#
#       This file is part of the GQL (Graphical Query Language) Toolkit
#
#       file:   GQLValidation.py
#       author: Ivan Costa (filho@molgen.mpg.de)
#
#       Copyright (C) 2003-2004 Alexander Schliep and Ivan Costa
#
#       Contact: filho@molgen.mpg.de
#
#       Information: http://ghmm.org/gql
#
#   GQL is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   GQL is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with GQL; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
#
#       This file is version $Revision: 2125 $
#                       from $Date: 2009-10-22 17:32:35 -0300 (Thu, 22 Oct 2009) $
#             last change by $Author: filho $.
#
################################################################################

from numpy.oldnumeric import *
import math
import random
import string

################################################################################
#      Definitions of external indices.
################################################################################

def sensitivity(tp,fn,fp,tn):
    ''' Sensitivity (recall) of the pair counts: tp / (tp + fn).

    fp and tn are not used; they are accepted so that every index
    function shares the same (tp, fn, fp, tn) signature.
    Returns 0.0 when there are no positive pairs (tp + fn == 0).
    '''
    positives = tp + fn
    if positives == 0:
        return 0.0
    return float(tp)/positives

def specificity(tp,fn,fp,tn):
    ''' Specificity of the pair counts: tn / (tn + fp).

    tp and fn are not used; they are accepted so that every index
    function shares the same (tp, fn, fp, tn) signature.
    Returns 0.0 when there are no negative pairs (tn + fp == 0).
    '''
    negatives = tn + fp
    if negatives == 0.0:
        return 0.0
    return float(tn)/negatives

def ppv(tp,fn,fp,tn):
    ''' Positive predictive value of the pair counts: tp / (tp + fp).

    fn and tn are not used; they are accepted so that every index
    function shares the same (tp, fn, fp, tn) signature.
    Returns 0.0 when no pair was predicted positive (tp + fp == 0).
    '''
    predictedPositive = tp + fp
    if predictedPositive == 0.0:
        return 0.0
    return float(tp)/predictedPositive

def npv(tp,fn,fp,tn):
    ''' Negative predictive value of the pair counts: tn / (fn + tn).

    tp and fp are not used; they are accepted so that every index
    function shares the same (tp, fn, fp, tn) signature.
    Returns 0.0 when no pair was predicted negative (fn + tn == 0).
    '''
    predictedNegative = fn + tn
    if predictedNegative == 0.0:
        return 0.0
    return float(tn)/predictedNegative

def rand(tp,fn,fp,tn):
    ''' Rand index of the pair counts: (tp + tn) / (tp + fn + fp + tn).

    Returns 0.0 when all four counts add up to zero.
    '''
    allPairs = tp+fn+fp+tn
    if allPairs == 0:
        return 0.0
    return float(tp+tn)/allPairs

def correctedRand(tp,fn,fp,tn,maxi=1):
    ''' Rand index corrected for chance.

    The expected value is obtained by feeding rand() with the products of
    the marginal pair counts, and the correction itself is delegated to
    correctedIndex() with `maxi` as the maximum attainable value.
    Returns 0.0 when all four counts add up to zero.
    '''
    if (tp + fn + fp + tn) == 0:
        return 0.0

    observed = rand(tp,fn,fp,tn)
    # products of the marginals, in the argument order rand() expects
    expected = rand((tp+fn)*(tp+fp),
                    (fp+tn)*(tp+fp),
                    (tp+fn)*(tn+fn),
                    (tn+fn)*(tn+fp))
    return correctedIndex(observed,expected,max=maxi)

def correctedSen(tp,fn,fp,tn,maxi=1):
    ''' Sensitivity corrected for chance.

    The raw sensitivity is corrected with correctedIndex(), using as
    expected value the sensitivity of the products of the marginal pair
    counts and `maxi` as the maximum attainable value.
    Returns 0.0 when all four counts add up to zero.

    The raw value is taken from sensitivity(), so the case tp + fn == 0
    yields a raw sensitivity of 0.0 instead of raising ZeroDivisionError.
    '''
    n = tp + fn + fp + tn
    if n == 0:
        return 0.0

    # NOTE(review): with this argument order the correction evaluates to
    # (tp+fn)/n; confirm that this is the intended expected sensitivity.
    correction = sensitivity((tp+fn)*(tp+fp),(fp+tn)*(tp+fp),(tp+fn)*(tn+fn),(tn+fn)*(tn+fp))
    return correctedIndex(sensitivity(tp,fn,fp,tn),correction,max=maxi)

def correctedSpe(tp,fn,fp,tn,maxi=1):
    ''' "Specificity" corrected for chance.

    The raw value tp / (tp + fp) is corrected with correctedIndex(), using
    as expected value the specificity of the products of the marginal pair
    counts and `maxi` as the maximum attainable value.
    Returns 0.0 when all four counts add up to zero.

    The raw value is taken from ppv(), which computes the same ratio but
    returns 0.0 instead of raising ZeroDivisionError when tp + fp == 0.
    '''
    n = tp + fn + fp + tn
    if n == 0:
        return 0.0

    # NOTE(review): the raw index is tp/(tp+fp) (i.e. ppv) while the
    # correction is built with specificity(); confirm which definition of
    # specificity is intended before relying on this value.
    correction = specificity((tp+fn)*(tp+fp),(fp+tn)*(tp+fp),(tp+fn)*(tn+fn),(tn+fn)*(tn+fp))
    return correctedIndex(ppv(tp,fn,fp,tn),correction,max=maxi)

def correctedIndex(rawIndex, expectedIndex, max=1):
    ''' Correct a statistic for its expected and maximum value:

            (rawIndex - expectedIndex) / (max - expectedIndex)

    Returns 0.0 when the expected value equals the maximum, where the
    ratio is undefined.
    '''
    span = max-expectedIndex
    if span == 0:
        return 0.0 # XXX check this XXX
    return (rawIndex-expectedIndex)/span

def computeExternalIndices(classes, clusters, no_classes, no_clusters, withTP = False, index = None):
    ''' For an array of class labels and an array of cluster labels
        compute the corrected Rand index, specificity, sensitivity, Rand
        index, an error count and the positive/negative predictive values.
        See "Jain & Dubes, Clustering methods and algorithms" for a
        mathematical specification of the indices.

        classes, clusters       -- label sequences in the same object order
        no_classes, no_clusters -- dimensions of the contingency table
        withTP                  -- when true, the pair counts are returned too
        index                   -- when given, pair counts are computed only
                                   for the pairs involving that object
                                   (leave one out)

        Returns (ct, cRand, spe, sen, rand, error, ppv, npv) or, with
        withTP, (ct, cRand, spe, sen, rand, tp, fn, fp, tn, error, ppv, npv).
    '''
    if index is None:
        [tp,fn,fp,tn] = computeErrors(classes, clusters)
    else: # leave one out
        [tp,fn,fp,tn] = computeErrorsLeaveOut(classes, clusters, index)

    cRand = correctedRand(tp,fn,fp,tn)
    randa = rand(tp,fn,fp,tn)
    sen = sensitivity(tp,fn,fp,tn)
    spe = specificity(tp,fn,fp,tn)
    posPred = ppv(tp,fn,fp,tn)
    negPred = npv(tp,fn,fp,tn)
    ct = contigencyTable(classes, clusters, no_classes, no_clusters)

    # For every cluster (column) add the largest cell minus the column
    # total; the accumulated value is therefore never positive.
    error = 0
    for i in range(size(ct,1)):
        error = error + (max(ct[:,i]) - sum(ct[:,i]))

    if withTP:
        return (ct,cRand,spe,sen,randa,tp,fn,fp,tn, error,posPred,negPred)
    else:
        return (ct,cRand,spe,sen,randa, error,posPred,negPred)

    
    
def clusterDist(cluster):
    ''' Number of objects carrying each label 1, 2, ..., max(cluster).

    Returns a list whose k-th entry (0-based) is the size of the cluster
    labelled k+1.
    '''
    labels = array(cluster)
    return [sum(labels == label) for label in range(1, max(labels) + 1)]

################################################################################
#      Methods for calculating the tp, fn, fp, tn necessary in all external indices
################################################################################

def computeErrors(classes, clusters):
    """ Count pair agreements between a class labelling and a clustering.

        Every unordered pair of objects is inspected once:

          tp (a) -- same class,      same cluster
          fn (b) -- same class,      different cluster
          fp (c) -- different class, same cluster
          tn (d) -- different class, different cluster

        Both sequences must list the objects in the same order. Labels may
        be of any type supporting the '==' operator.

        Returns the tuple (tp, fn, fp, tn).
    """
    noObjects = len(classes)
    tp = fn = fp = tn = 0

    for i in range(noObjects):
        classOfI = classes[i]
        for j in range(i+1, noObjects):
            if classOfI == classes[j]:
                # (i,j) is a positive
                if clusters[i] == clusters[j]:
                    tp += 1
                else:
                    fn += 1
            elif clusters[i] == clusters[j]:
                # (i,j) is a negative placed together
                fp += 1
            else:
                tn += 1

    return (tp, fn, fp, tn)

def computeErrorsLeaveOut(classes, clusters,leaveOut):
    """ Count pair agreements for the pairs that involve one given object.

        Only the pairs (leaveOut, j) with j != leaveOut are inspected:

          tp (a) -- same class,      same cluster
          fn (b) -- same class,      different cluster
          fp (c) -- different class, same cluster
          tn (d) -- different class, different cluster

        Both sequences must list the objects in the same order. Labels may
        be of any type supporting the '==' operator.

        Returns the tuple (tp, fn, fp, tn).
    """
    tp = fn = fp = tn = 0

    for other in range(len(classes)):
        if other == leaveOut:
            continue
        if classes[leaveOut] == classes[other]:
            # the pair is a positive
            if clusters[leaveOut] == clusters[other]:
                tp += 1
            else:
                fn += 1
        elif clusters[leaveOut] == clusters[other]:
            # the pair is a negative placed together
            fp += 1
        else:
            tn += 1

    return (tp, fn, fp, tn)

def computeErrorsFromMatrices(classes,clusters):
    ''' Accumulate tp, fn, fp and tn from two n x n pair matrices.

    classes[i,j] and clusters[i,j] hold the degree (0..1) to which objects
    i and j are together in the respective partition/mixture. Only the
    entries above the diagonal are read, so each unordered pair
    contributes once.

    Returns the tuple (tp, fn, fp, tn) of floats.
    '''
    noObjects = len(classes)
    tp = fn = fp = tn = 0.0

    for row in range(noObjects):
        for col in range(row+1,noObjects):
            together = classes[row,col]
            clustered = clusters[row,col]

            tp += together*clustered
            tn += (1-together)*(1-clustered)
            fp += (1-together)*clustered
            fn += together*(1-clustered)

    return (tp, fn, fp, tn)


def computeProbabilisticErrors(mixtureclasses, mixtureclusters,mask=[],type=1):
    """ Soft pair counts between two posterior distributions over the same
        objects (optimized variant of computeProbabilisticErrorsOld).

        For two posteriors distributions from the same set of objects
        compute true positives, false negatives, true negatives and
        false positives. In the context of external indices these values
        correspond to a, b, c and d.

        Assumes identical order of objects in both arrays.

        mixtureclasses  -- posteriors objects x classes
        mixtureclusters -- posteriors objects x clusters
        mask            -- one truth value per object; only pairs whose two
                           objects are both selected are counted. An empty
                           mask (the default) or None selects every object.
        type            -- how pairs are scored:
                           1: agreement = dot product of the two posteriors,
                              counts combined by products
                           2: agreement = sum of element-wise minima,
                              counts combined by products
                           3: dot-product agreement, counts combined with
                              min / max
                           4: sum-of-minima agreement, counts combined with
                              min / max
                           5: agreement = largest element-wise minimum,
                              disagreement = largest min(p_i[k], p_j[l])
                              over k < l, counts combined with min

        Types 1 and 3 use a matrix product for all pairs at once; type 5
        indexes the posteriors as [i,k] and therefore needs arrays.

        Returns the tuple (tp, fn, fp, tn).
    """
    noObjects = len(mixtureclasses)

    # "mask == []" is not a reliable emptiness test for arrays
    if mask is None or len(mask) == 0:
        mask = ones(noObjects,int)

    tp = 0 # a
    fn = 0 # b
    fp = 0 # c
    tn = 0 # d

    if (type == 1 or type == 3):
        pairAgreementClas = dot(mixtureclasses,transpose(mixtureclasses))
        pairAgreementClus = dot(mixtureclusters,transpose(mixtureclusters))

    # For all unordered pairs of selected objects
    for i in range(noObjects):
        if not mask[i]:
            continue
        for j in range(i+1, noObjects):
            if not mask[j]:
                continue

            if type == 5:
                pcs = max(minimum(mixtureclasses[i],mixtureclasses[j]))
                pct = max(minimum(mixtureclusters[i],mixtureclusters[j]))

                pcsn = 0.0
                for k in range(len(mixtureclasses[i])):
                    for l in range(k+1,len(mixtureclasses[i])):
                        pcsn = max(min(mixtureclasses[i,k],mixtureclasses[j,l]),pcsn)
                pctn = 0.0
                for k in range(len(mixtureclusters[i])):
                    for l in range(k+1,len(mixtureclusters[i])):
                        pctn = max(min(mixtureclusters[i,k],mixtureclusters[j,l]),pctn)

                tp += min(pcs,pct)
                fn += min(pcsn,pct)
                fp += min(pcs,pctn)
                tn += min(pcsn,pctn)
                continue

            if type == 2 or type == 4:
                pcs = sum(minimum(mixtureclasses[i],mixtureclasses[j]))
                pct = sum(minimum(mixtureclusters[i],mixtureclusters[j]))
            else:
                pcs = pairAgreementClas[i,j]
                pct = pairAgreementClus[i,j]

            if type == 3 or type == 4:
                tp += min(pcs,pct)
                tn += 1 - max(pcs,pct)
                fp += max(pct-pcs,0)
                fn += max(pcs-pct,0)
            elif type == 1 or type == 2:
                tp += pcs*pct
                tn += (1-pcs)*(1-pct)
                fn += pcs*(1-pct)
                fp += (1-pcs)*pct

    return (tp, fn, fp, tn)

################################################################################
#      Methods for obtaining contingency tables
################################################################################


def contigencyTable(classes, clusters, no_classes, no_clusters):
    ''' Build the contingency table classes x clusters.

        Entry [k, c] counts the objects with class label k and cluster
        label c; labels are used directly as 0-based row/column indices.
        See "Jain & Dubes, Clustering methods and algorithms" '''
    table = zeros((no_classes, no_clusters), int)

    # Label offsets. An earlier (disabled) hack derived them from the
    # smallest label; they are now fixed at zero.
    classOffset = 0
    clusterOffset = 0

    for position, label in enumerate(classes):
        table[label-classOffset, clusters[position]-clusterOffset] += 1
    return table
    # construction of the contigency table classes x clusters

def contigencyTableFromMixture(mixtureCluster,mixturePartition):
    ''' Soft contingency table of two membership matrices over the same
    objects: the matrix product transpose(mixtureCluster) x mixturePartition.
    '''
    clusterByObject = transpose(mixtureCluster)
    return dot(clusterByObject,mixturePartition)


################################################################################
#      Internal Indices for finding the number of components
################################################################################


def computeBIC(logLikehood,profile,numberOfSequences):
    """ Bayesian Information Criterion:

            -2*logLikehood + d*log(numberOfSequences)

    where d is the number of free parameters derived from `profile`:
    for profile.type == 'HMM', 3 parameters for each of the (N-1) states
    of every model in profile.modelList(); otherwise (diagonal gaussian)
    2 * len(profile.profileSet[0]) * profile.getNoModels(). """
    if profile.type == 'HMM':
        freeParameters = 0
        for model in profile.modelList():
            freeParameters += (model.N-1)*3 # 3 parameters per state
    else: # diagonal gaussian
        freeParameters = 2*len(profile.profileSet[0])*profile.getNoModels()

    penalty = freeParameters*math.log(numberOfSequences)
    return -2*logLikehood + penalty

def computeBICFullMixture(logLikehood,numberOfSequences,dimention,nocomponents):
    """ Bayesian Information Criterion for a mixture whose parameter count
    is derived from the data dimension and the number of components:

            -2*logLikehood + d*log(numberOfSequences)

    with d = nocomponents + nocomponents*dm and
    dm = dimention + dimention*(dimention-1)/2 free parameters per
    component (taken from Figueiredo and Jain - to be checked). """
    # free parameters of a single component
    perComponent = dimention + dimention*(dimention-1)/2
    # total free parameters of the mixture
    freeParameters = nocomponents + nocomponents*perComponent

    penalty = freeParameters*math.log(numberOfSequences)
    return -2*logLikehood + penalty

def computeICL(logLikehood,profile,numberOfSequences):
    """ Integrated Classification Likelihood criterion:

            -2*logLikehood + 2*entropy + d*log(numberOfSequences)

    d is the number of free parameters derived from `profile` (3 per
    state beyond the first for every HMM in profile.modelList(), or
    2 * len(profile.profileSet[0]) * profile.getNoModels() otherwise).
    The entropy term is -sum(t*log(t)) over the non-zero entries of the
    component membership posterior profile.P, read as P[sequence, model]. """
    posterior = profile.P

    if profile.type == 'HMM':
        freeParameters = 0
        for model in profile.modelList():
            freeParameters += (model.N-1)*3 # 3 parameters per state
    else: # diagonal gaussian
        freeParameters = 2*len(profile.profileSet[0])*profile.getNoModels()

    uncertainty = 0
    for sequence in range(numberOfSequences):
        for component in range(profile.getNoModels()):
            weight = posterior[sequence, component]
            if weight != 0: # 0*log(0) is taken as 0
                uncertainty -= weight * math.log(weight)

    return -2*logLikehood + 2*uncertainty + freeParameters*math.log(numberOfSequences)

def computeAIC(logLikehood,profile):
    """ Akaike's Information Criterion: -2*logLikehood + 2*d

    where d is the number of free parameters derived from `profile`:
    for profile.type == 'HMM', 3 parameters for each of the (N-1) states
    of every model in profile.modelList(); otherwise (diagonal gaussian)
    2 * len(profile.profileSet[0]) * profile.getNoModels(). """
    if profile.type == 'HMM':
        freeParameters = 0
        for model in profile.modelList():
            freeParameters += (model.N-1)*3 # 3 parameters per state
    else: # diagonal gaussian
        freeParameters = 2*len(profile.profileSet[0])*profile.getNoModels()

    return -2*logLikehood + 2*freeParameters

def mixtureOverlap(mixture):
    ''' Overlap between the components of a mixture.

    `mixture` is indexed as mixture[object, component]. Entry [a,b] of the
    returned c x c matrix (c = number of components) is the sum over all
    objects of mixture[k,a]*mixture[k,b], divided by the product of the
    column totals of components a and b. '''
    noComponents = len(mixture[0])
    noObjects = len(mixture)

    overlap = zeros([noComponents,noComponents],Float)
    mass = zeros(noComponents,Float)

    # total membership of every component
    for a in range(noComponents):
        for k in range(noObjects):
            mass[a] += mixture[k,a]

    for a in range(noComponents):
        for b in range(noComponents):
            for k in range(noObjects):
                overlap[a,b] += mixture[k,a]*mixture[k,b]
            overlap[a,b] /= (mass[a]*mass[b])

    return overlap

def partitionCoefficient(mixture):
    ''' Partition coefficient of a mixture: the sum of all squared
    membership values divided by the number of rows (objects). It measures
    how well the mixture defines a partition, as defined in
    Bezdek et al. (1995). '''
    squares = 0.0
    for memberships in mixture:
        for value in memberships:
            squares += value*value
    return squares/len(mixture)


def pvalue(distribution,value,rightTail=1):
    ''' Empirical p-value of `value` against a simulated distribution.

    With rightTail true, returns the fraction of simulated values strictly
    greater than `value`; otherwise (left tail) one minus that fraction.
    '''
    greater = len([v for v in distribution if v > value])
    fraction = float(greater)/len(distribution)
    if rightTail:
        return fraction
    return 1 - fraction

################################################################################
#      Information-theoretic measures
#      See the BSB 2007 paper (for mutual information) or Meila (for variation of information)
################################################################################

# Tiny positive constant added to probabilities before they are used as
# divisors or passed to log2() in the information-theoretic measures below.
small_value = 1e-100


def log2(x):
    ''' Element-wise base-2 logarithm, returned transposed.

    For a two-dimensional array x (indexed as x[i,j]) the result is the
    array of logarithms transposed, i.e. result[j,i] = log2(x[i,j]).
    For a one-dimensional x of length n the result has shape (1, n).

    An entry whose logarithm cannot be computed (e.g. zero) is printed
    and left as 0.0 in the result.
    '''
    import numpy

    # Decide between the 2-D and the 1-D layout up front, so that errors
    # raised while computing the logarithms are not mistaken for
    # "x is one-dimensional".
    try:
        noColumns = len(x[0])
    except (TypeError, IndexError):
        noColumns = None # scalar entries or empty input

    if noColumns is not None:
        res = numpy.zeros((len(x),noColumns),float)
        for i in range(len(x)):
            for j in range(noColumns):
                try:
                    res[i,j] = math.log(x[i,j],2)
                # math.log signals a bad argument with OverflowError on
                # old interpreters and with ValueError on current ones
                except (OverflowError, ValueError):
                    print(x[i,j])
    else:
        res = numpy.zeros((len(x),1),float)
        for i in range(len(x)):
            try:
                res[i] = math.log(x[i],2)
            except (OverflowError, ValueError):
                print(x[i])

    return res.transpose()


def mutualInfoTerms(x,y,t=0):
    ''' Per-component terms of the mutual information between two
    membership matrices x and y (objects x components) over the same
    objects.

    With pxy = transpose(x).y / n and the marginals px, py (column means
    plus small_value), the terms are pxy * log2(pxy / (px*py)). The
    function returns h, the negated sum of the terms over the components
    of y (one value per component of x); when t is non-zero it returns the
    pair (h, terms) instead.
    '''
    import numpy
    noObjects = len(x)

    # marginals, kept strictly positive
    px = numpy.sum(x,0)/noObjects + small_value
    py = numpy.sum(y,0)/noObjects + small_value

    # joint distribution of the components of x and y
    pxy = numpy.dot(numpy.transpose(x),y)/noObjects

    # joint divided by the product of the marginals
    pyGrid = numpy.resize(py,(len(px),len(py)))
    pxGrid = numpy.transpose(numpy.resize(px,(len(py),len(px))))
    ratio = (pxy/pyGrid)/pxGrid

    # log2() returns its result transposed, hence the transposed argument
    terms = pxy*log2(ratio.transpose()+small_value)

    h = -numpy.sum(terms,1)
    if t != 0:
        return h, terms
    return h
      

def mutualInfo(x,y):
    ''' Mutual information between two membership matrices: the negated
    total of the values returned by mutualInfoTerms(x,y). '''
    import numpy
    perComponent = mutualInfoTerms(x,y)
    return -numpy.sum(perComponent)

def entropy(x):
    ''' Entropy (base 2) of the component distribution of a membership
    matrix x (objects x components): px is the column mean of x and the
    result is -sum(px*log2(px+small_value)). '''
    import numpy
    px = numpy.sum(x,0)/len(x)
    weighted = px*log2(px+small_value)
    return -sum(sum(weighted))
    
def variationInformation(x,y):
    ''' Variation of information between two membership matrices:
    entropy(x) + entropy(y) - 2*mutualInfo(x,y). '''
    entropyX = entropy(x)
    entropyY = entropy(y)
    shared = mutualInfo(x,y)
    return entropyX + entropyY - 2*shared

def normMutualInformation(x,y):
    ''' Normalized mutual information between two membership matrices:
    mutualInfo(x,y) / sqrt(entropy(x)*entropy(y)). '''
    shared = mutualInfo(x,y)
    scale = math.sqrt(entropy(x)*entropy(y))
    return shared/scale



################################################################################
#      Auxiliary functions for
#      creating and handling matrices from posteriors or partitions
################################################################################


def matrixFromPartition(partition,n):
    ''' Create the n x n co-occurrence matrix of a hard partition.

    `partition` is a sequence of groups, each a sequence of object
    indices. Entry [i,j] is 1 when i and j appear in the same group and
    0 otherwise.
    Because of matrix allocation time, this function is only worth doing
    when the test of null hypothesis is also being carried out. '''
    cooccurrence = zeros([n,n],Float)
    for group in partition:
        for first in group:
            for second in group:
                cooccurrence[first,second] = cooccurrence[second,first] = 1
            cooccurrence[first,first] = 1
    return cooccurrence

def matrixFromPartitionWithOverlap(partition,n):
    ''' Create the n x n co-occurrence matrix of an overlapping partition.

    `partition` is a sequence of groups, each an indexable sequence of
    object indices. For two objects sharing a group the entry is
    1/(count_i*count_j), where count_k is the number of times object k
    appears in the partition; the diagonal entry of every object that
    appears in a group is 1.
    Because of matrix allocation time, this function is only worth doing
    when the test of null hypothesis is also being carried out. '''
    # number of occurrences of every object in the partition
    occurrences = zeros(n,int)
    for group in partition:
        for member in group:
            occurrences[member] += 1

    cooccurrence = zeros([n,n],Float)
    for group in partition:
        groupSize = len(group)
        for a in range(groupSize):
            for b in range(a,groupSize):
                weight = 1.0/(occurrences[group[a]]*occurrences[group[b]])
                cooccurrence[group[a],group[b]] = weight
                cooccurrence[group[b],group[a]] = cooccurrence[group[a],group[b]]
            cooccurrence[group[a],group[a]] = 1
    return cooccurrence

def matrixFromMixture(mixture):
    ''' Create the n x n co-occurrence matrix of a mixture with n objects.

    The entry for two distinct objects is the sum over all components of
    the product of their membership values; the diagonal is 1.
    Because of matrix allocation time, this function is only worth doing
    when the test of null hypothesis is also being carried out. '''
    noObjects = len(mixture)
    noComponents = len(mixture[0])
    cooccurrence = zeros([noObjects,noObjects],Float)

    for first in range(noObjects):
        for second in range(first+1,noObjects):
            for component in range(noComponents):
                cooccurrence[first,second] += mixture[first][component]*mixture[second][component]
            # mirror the finished entry below the diagonal
            cooccurrence[second,first] = cooccurrence[first,second]
        cooccurrence[first,first] = 1.0
    return cooccurrence

def partitionJoin(partition,notJoin):
    ''' Reduce a partition to two groups: the group at position `notJoin`,
    followed by the concatenation of all the other groups (in order). '''
    merged = []
    for position, group in enumerate(partition):
        if position != notJoin:
            merged = merged + group
    return [partition[notJoin], merged]

def mixtureJoin(mixture,notJoin):
    ''' Reduce a mixture to two components: column 0 holds the membership
    in component `notJoin`, column 1 holds one minus that value (all the
    other components joined). '''
    joined = zeros((len(mixture),2),Float)
    for row in range(len(mixture)):
        kept = mixture[row][notJoin]
        joined[row][0] = kept
        joined[row][1] = 1.0-kept
    return joined

def mixtureFromPartition(labels,no_classes):
     ''' Turn a sequence of hard labels into a 0/1 membership matrix with
     one row per object and no_classes columns.

     When the smallest label is 1 the labels are taken as 1-based and
     shifted down by one; otherwise they are used directly as column
     indices. '''
     # 1 when the labels start at one, 0 otherwise
     shift = int(min(labels) == 1)

     memberships = zeros([len(labels),no_classes],Float)
     for row, label in enumerate(labels):
         memberships[row][label-shift] = 1
     return memberships

def mixtureFromOverlappingartition(partition,size):
     ''' Turn an overlapping partition into a membership matrix with
     `size` rows (objects) and one column per group.

     Every object gets a 1 in the column of each group it belongs to, and
     each row is then divided by its total. '''
     memberships = zeros([size,len(partition)],Float)

     for column, members in enumerate(partition):
         for obj in members:
             memberships[obj][column] = 1

     rowTotals = sum(memberships,1)
     return transpose(transpose(memberships)/rowTotals)

def maxPosteriorMixture(mixture):
    ''' Harden a mixture: return a 0/1 matrix of the same shape in which
    every row has a single 1.0, placed at the component with the largest
    membership value of that row. '''
    res = zeros([len(mixture),len(mixture[0])],Float)
    # The axis is given explicitly (last axis = one winner per row) so the
    # result does not depend on the default axis of the argmax in scope.
    argm = argmax(mixture,-1)
    for i in range(len(mixture)):
        res[i][argm[i]] = 1.0
    return res

def readMixture(fileName,last=1,firstL=0,firstC=1):
    ''' Read a tab-separated mixture file.

    fileName -- path of the file to read
    last     -- when 1 every column from firstC on is read; otherwise the
                last column of each line is dropped
    firstL   -- index of the first line to read (earlier lines are skipped)
    firstC   -- index of the first column holding a membership value

    The first column of each line is taken as the object id. Empty fields
    are skipped; all other fields are converted with float().

    Returns (ids, mixture), where ids is the list of object ids and
    mixture the array built from the rows of values.
    '''
    handle = open(fileName,'r')
    try:
        lines = handle.readlines()
    finally:
        # close the file even when reading fails
        handle.close()

    ids = []
    mix = []

    for line in lines[firstL:]:
        items = line.strip('\r\n').split('\t')
        ids.append(items[0])
        # hack: some files carry a trailing column that is not a value
        if last == 1:
            fields = items[firstC:]
        else:
            fields = items[firstC:-1]
        mix.append([float(field) for field in fields if field])

    return ids, array(mix)

################################################################################
# old stuff ...
# old stuff ...
# old stuff ...
# old stuff ...
# old stuff ...
# old stuff ...
# old stuff ...
# old stuff ...
################################################################################


def computeRandomClustering(noIndividuals,noClusters):
    ''' Draw a random clustering: a list with one label per individual,
    each chosen with random.randint from 0 .. noClusters-1. '''
    return [random.randint(0,noClusters-1) for individual in range(noIndividuals)]

def computeRandomMixture(noIndividuals,noClusters):
    ''' Draw a random mixture: a list with one row per individual holding
    noClusters values drawn with random.uniform(0,1) and divided by their
    total. '''
    mixture = []
    for individual in range(noIndividuals):
        draws = [random.uniform(0,1) for component in range(noClusters)]
        total = sum(draws)
        mixture.append([draw/total for draw in draws])
    return mixture

def computeRandomClusteringFixed(noIndividuals,clustersDistributions):
    ''' Draw a random clustering whose labels follow given cluster weights.

    clustersDistributions holds one weight per cluster. The weights are
    normalised by their total and accumulated; every individual receives
    the index returned by lowerThan() for a random.uniform(0.0,1.0) draw
    against the accumulated values. '''
    total = float(sum(clustersDistributions,0))

    # running totals of the normalised weights
    cumulative = []
    running = 0.0
    for weight in clustersDistributions:
        running += weight/total
        cumulative.append(running)

    return [lowerThan(random.uniform(0.0,1.0),cumulative) for individual in range(noIndividuals)]

def computeRandomShuffle(clusterAssignment):
    ''' Return a shuffled copy of clusterAssignment; the copy is taken
    with a full slice and shuffled with random.shuffle, so the argument
    itself is left untouched when slicing yields a new object. '''
    shuffled = clusterAssignment[:]
    random.shuffle(shuffled)
    return shuffled


def lowerThan(value, sequence):
    ''' Position of the first element of `sequence` that `value` is
    strictly lower than, or None when there is no such element. '''
    position = 0
    for bound in sequence:
        if value < bound:
            return position
        position += 1
    return None

def computeExternalIndicesOld(classes, clusters, no_classes, no_clusters):
    ''' Older variant of computeExternalIndices.

        For an array of class labels and an array of cluster labels
        compute the contingency table, the corrected Rand index (from the
        contingency table), a/(a+c), a/(a+b) and the Rand index, where
        a, b, c, d are the pair counts returned by computeErrors.
        See "Jain & Dubes, Clustering methods and algorithms" for a
        mathematical specification of the indices.

        Class labels are shifted down by one unless the smallest class
        label is 0; cluster labels are used directly as column indices.

        Returns (contingencyTable, cRand, a/(a+c), a/(a+b), rand).
    '''
    [a,b,c,d] = computeErrors(classes, clusters)

    noObjects = len(classes)
    noPairs = float(noObjects*(noObjects-1)/2.0)

    table = zeros((no_classes, no_clusters), int)
    classSizes = zeros(no_classes, int)
    clusterSizes = zeros(no_clusters, int)

    # hack: offset for class labels that do not start at zero
    classOffset = min(classes)
    if classOffset != 0:
        classOffset = 1

    # construction of the contingency table classes x clusters
    for obj in range(noObjects):
        table[classes[obj]-classOffset,clusters[obj]] += 1
        classSizes[classes[obj]-classOffset] += 1
        clusterSizes[clusters[obj]] += 1

    # number of object pairs inside every class, cluster and table cell
    pairsInClasses = 0
    for k in range(no_classes):
        pairsInClasses += classSizes[k]*(classSizes[k]-1)/2

    pairsInCells = 0
    for k in range(no_classes):
        for l in range(no_clusters):
            pairsInCells += table[k,l]*(table[k,l]-1)/2

    pairsInClusters = 0
    for l in range(no_clusters):
        pairsInClusters += clusterSizes[l]*(clusterSizes[l]-1)/2

    expectedPairs = pairsInClasses*pairsInClusters/noPairs
    cRand = float((pairsInCells - expectedPairs))/float(((pairsInClasses+pairsInClusters)/2.0 - expectedPairs))

    randIndex = (a+d)/float(a+b+c+d)

    return (table,cRand,(a/float(a+c)),(a/float(a+b)),randIndex)


class SubMatrix:
    ''' View on a matrix through a list of indices.

    Element [i,j] of the view is matrix[indices[i]][indices[j]], which
    allows selecting a subset of a matrix or shuffling its rows/columns
    without copying it. The length of the view is the number of indices.
    '''
    # class-level defaults; both are replaced per instance in __init__
    matrix = []
    indices = []

    def __init__(self,matrix,indices):
        self.indices = indices
        self.matrix = matrix

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, key):
        row, column = key
        lookup = self.indices
        return self.matrix[lookup[row]][lookup[column]]

def computeProbabilisticErrorsOld(mixtureclasses, mixtureclusters,mask=[],type=1):
    """ Soft pair counts between two posterior distributions over the same
        objects (plain-loop variant).

        Compute true positives, false negatives, true negatives and
        false positives. In the context of external indices these values
        correspond to a, b, c and d.

        Assumes identical order of objects in both arrays.

        mixtureclasses  -- posteriors objects x classes
        mixtureclusters -- posteriors objects x clusters
        mask            -- one truth value per object; only pairs whose two
                           objects are both selected are counted. An empty
                           mask (the default) or None selects every object.
        type            -- 2 or 4: pair agreement is the sum of the
                           element-wise minima of the two posteriors;
                           any other value: their dot product.
                           3 or 4: counts are combined with min / max;
                           any other value: with products.

        Returns the tuple (tp, fn, fp, tn).
    """
    noObjects = len(mixtureclasses)

    # "mask == []" is not a reliable emptiness test for arrays
    if mask is None or len(mask) == 0:
        mask = ones(noObjects,Float)

    tp = 0 # a
    fn = 0 # b
    fp = 0 # c
    tn = 0 # d

    cls = len(mixtureclasses[0])
    cts = len(mixtureclusters[0])

    # For all unordered pairs of selected objects
    for i in range(noObjects):
        if not mask[i]:
            continue
        for j in range(i+1, noObjects):
            if not mask[j]:
                continue

            pcs = 0.0
            pct = 0.0

            if type == 2 or type == 4:
                for c in range(cls):
                    pcs += min(mixtureclasses[i][c],mixtureclasses[j][c])
                for c in range(cts):
                    pct += min(mixtureclusters[i][c],mixtureclusters[j][c])
            else:
                for c in range(cls):
                    pcs += mixtureclasses[i][c]*mixtureclasses[j][c]
                for c in range(cts):
                    pct += mixtureclusters[i][c]*mixtureclusters[j][c]

            if type == 3 or type == 4:
                tp += min(pcs,pct)
                tn += 1 - max(pcs,pct)
                fp += max(pct-pcs,0)
                fn += max(pcs-pct,0)
            else:
                tp += pcs*pct
                tn += (1-pcs)*(1-pct)
                fn += pcs*(1-pct)
                fp += (1-pcs)*pct

    return (tp, fn, fp, tn)
def randomLabelNullHypothesis(classes,clusters,rep):
    ''' Pair counts under the random label null hypothesis.

    classes and clusters are n x n co-occurrence matrices (indexed as
    m[i,j]) of two partitions of the same objects. For each of `rep`
    replications the objects of `classes` are randomly relabelled and the
    soft pair counts against `clusters` are accumulated over all
    unordered pairs.

    Returns (tp, fn, fp, tn), each an array with one value per
    replication.
    '''
    n = len(classes)
    # an explicit list, so that it can be shuffled in place
    indices = list(range(n))
    # This first shuffle does not affect any result by itself; it is kept
    # so that seeded runs consume the random stream as they always did.
    random.shuffle(indices)

    tp = zeros(rep,Float)
    fn = zeros(rep,Float)
    fp = zeros(rep,Float)
    tn = zeros(rep,Float)

    for r in range(rep):
        random.shuffle(indices)

        for i in range(n):
            for j in range(i+1,n):
                shuffled = classes[indices[i],indices[j]]
                tp[r] += shuffled*clusters[i,j]
                tn[r] += (1-shuffled)*(1-clusters[i,j])
                fp[r] += (1-shuffled)*clusters[i,j]
                fn[r] += shuffled*(1-clusters[i,j])
    return (tp, fn, fp, tn)

def randomLabelNullHypothesisProbabilistic(mixtureclasses, mixtureclusters,rep,mask=[],type=1):
    ''' Soft pair counts under the random label null hypothesis.

    For each of `rep` replications the rows (objects) of mixtureclusters
    are randomly permuted and computeProbabilisticErrors is evaluated
    against mixtureclasses; `mask` and `type` are passed on unchanged.

    Returns (tps, fns, fps, tns), each an array with one value per
    replication.
    '''
    n = len(mixtureclasses)
    # an explicit list, so that it can be shuffled in place
    indices = list(range(n))

    tps = zeros(rep,Float)
    fns = zeros(rep,Float)
    fps = zeros(rep,Float)
    tns = zeros(rep,Float)

    for r in range(rep):
        random.shuffle(indices)

        # permute whole rows: the axis is stated explicitly rather than
        # left to the default of the take in scope
        shuffledMixtureclusters = take(mixtureclusters,indices,0)

        (tp,fn,fp,tn) = computeProbabilisticErrors(mixtureclasses, shuffledMixtureclusters,mask=mask,type=type)
        tps[r] = tp
        fns[r] = fn
        fps[r] = fp
        tns[r] = tn

    return (tps, fns, fps, tns)

def computeGOErrors(cluster,mapping,genes):
    ''' Soft pair counts of a hard clustering against GO mappings.

    cluster -- one cluster label per gene
    mapping -- maps a gene to its collection of GO nodes
    genes   -- gene identifiers, in the same order as `cluster`

    Only genes present in `mapping` take part. For each unordered pair of
    such genes, pct tells whether both are in the same cluster and pcs is
    the number of nodes of the second gene found among the nodes of the
    first, divided by the product of the two node counts; the four counts
    are accumulated from products of pcs and pct.

    Returns (tp, fn, fp, tn, ct), where ct is the number of genes found
    in `mapping`.
    '''
    tp = fn = fp = tn = 0.0
    annotated = 0
    noGenes = len(cluster)

    # For all unordered pairs of annotated genes
    for i in range(noGenes):
        if genes[i] not in mapping:
            continue
        annotated += 1
        for j in range(i+1, noGenes):
            if genes[j] not in mapping:
                continue

            pct = (cluster[i] == cluster[j])

            nodesj = mapping[genes[j]]
            nodesi = mapping[genes[i]]
            shared = 0.0
            for node in nodesj:
                if node in nodesi:
                    shared += 1
            pcs = shared/(len(nodesi)*len(nodesj))

            tp += pcs*pct
            tn += (1-pcs)*(1-pct)
            fn += pcs*(1-pct)
            fp += (1-pcs)*pct

    return (tp, fn, fp, tn, annotated)

def computeGOErrorsMixture(mixtureclusters,mapping,genes,filter=[]):
    ''' Soft pair counts of a mixture clustering against GO mappings.

    mixtureclusters -- one row of cluster memberships per gene
    mapping         -- maps a gene to its collection of GO nodes
    genes           -- gene identifiers, in the same order as the rows
    filter          -- genes to take into account

    Only genes present in both `filter` and `mapping` take part. For each
    unordered pair of such genes, pct is the sum of the element-wise
    product of their membership rows and pcs is the number of nodes of the
    second gene found among the nodes of the first, divided by the product
    of the two node counts; the four counts are accumulated from products
    of pcs and pct.

    NOTE(review): with the default empty `filter` no gene is selected and
    all four counts stay 0.0 - confirm whether an empty filter was meant
    to select every gene.

    Returns (tp, fn, fp, tn).
    '''
    tp = fn = fp = tn = 0.0
    noGenes = len(mixtureclusters)

    for i in range(noGenes):
        if genes[i] not in filter:
            continue
        if genes[i] not in mapping:
            continue
        for j in range(i+1, noGenes):
            if genes[j] not in filter:
                continue
            if genes[j] not in mapping:
                continue

            pct = sum(multiply(mixtureclusters[i],mixtureclusters[j]))

            nodesj = mapping[genes[j]]
            nodesi = mapping[genes[i]]
            shared = 0.0
            for node in nodesj:
                if node in nodesi:
                    shared += 1
            pcs = shared/(len(nodesi)*len(nodesj))

            tp += pcs*pct
            tn += (1-pcs)*(1-pct)
            fn += pcs*(1-pct)
            fp += (1-pcs)*pct

    return (tp, fn, fp, tn)

def indexStatistics(tp,fn,fp,tn,tpd,fnd,fpd,tnp,statistics):
    ''' Evaluate an index against its simulated null distribution.

    tp, fn, fp, tn       -- observed pair counts
    tpd, fnd, fpd, tnp   -- sequences of simulated pair counts, one entry
                            per replication
    statistics           -- index function taking (tp, fn, fp, tn)

    Returns (corrected, observed, mean, p), where observed is the index of
    the observed counts, mean the average index over the replications,
    corrected the result of correctedIndex(observed, mean) and p the
    result of pvalue() for the observed index in the simulated ones.
    '''
    observed = statistics(tp,fn,fp,tn)
    simulated = [statistics(tpd[i],fnd[i],fpd[i],tnp[i]) for i in range(len(tpd))]
    average = float(sum(simulated))/float(len(simulated))
    return correctedIndex(observed,average),observed,average,pvalue(simulated,observed)

