#--------------------------------------------------------------------------
# Tool Name:  AMOEBA Clustering
# Source Name: AMOEBAClustering.py
# Version: 1.0
# Author: Jared Aldstadt and Yeming Fan
#
#--------------------------------------------------------------------------

#--------------------------------------------------------------------------
#Imports
#--------------------------------------------------------------------------
import sys, os, locale, math,pdf
# HelperFunctions.py contains functions and custom classes used by all
# Stat tools.
import HelperFunctions as HF
import galReader as GR
#--------------------------------------------------------------------------
#AMOEBA Tool Functions
#-------------------------------------------------------------------------- 
#   get_inputs -        parses sys.argv and returns an object of class gen
#                       that contains each user specified input argument.
#
#   build_value_lists - reads the Z field and returns the list of keys, a
#                       key -> Z dictionary, the sum of Z and the sum of Z^2.
#
#   listGStat - calculates and returns a Gi Z Score for a list of keys.
#
#   gAMOEBAForIFast - computes an AMOEBA cluster for a given seed location.
#
#   output_results - constructs a new feature class with calculated results
#                    for each record.  Returns the results field name.
#--------------------------------------------------------------------------
def get_inputs():
    """Collect the tool parameters and bundle them into an HF.Gen object.

    The parameters are read by position from the geoprocessor:
        0  input feature class
        1  attribute field name (upper-cased)
        2  output feature class
        3  GAL weights matrix file (stored under the name 'bWtsFile')
        4  cutoff for core clusters
        5  confidence level
        6  adjustment method

    Any failure while reading the parameters is re-raised as
    HF.ReportError(HF.msgParseErr); any failure while filling the Gen
    object is re-raised as HF.ReportError(HF.msgInputsErr).
    """
    # Historical command-line variant, kept for reference:
    #   sInputFC = arguments[0]                     # Input Feature Class
    #   sZField = arguments[1].upper()              # Attribute Field Name
    #   sOutputFC = arguments[2]                    # Output Feature Class
    #   sWeights = arguments[3]                     # GAL Weights matrix file
    #   if sWeights == "#" or sWeights == "": sWeights = None
    #
    # Values that can be hard-wired to test the code with provided datasets:
    #   sInputFC = "countryClusters.shp"
    #   sZField = "CASES"
    #   sOutputFC = "C:\TEMP\countryCore.shp"
    #   sWeights = "countryClusters.GAL"
    #   coreCutoff = "3.0"
    #   confidenceLevel="0.05"
    #   method is one of "Cutoff adjustment" (1), "FDR adjustment" (2),
    #   "Bonferroni adjustment" (3)
    try:
        readParam = HF.pGP.GetParameterAsText
        sInputFC = readParam(0)
        sZField = readParam(1).upper()
        sOutputFC = readParam(2)
        sWeights = readParam(3)
        coreCutoff = readParam(4)
        confidenceLevel = readParam(5)
        method = readParam(6)
    except: raise HF.ReportError (HF.msgParseErr)

    # Populate a generic container; the order of the add() calls is kept.
    try:
        obj = HF.Gen()
        namedValues = (
            ('sInputFC', sInputFC),
            ('sZField', sZField),
            ('sOutputFC', sOutputFC),
            ('bWtsFile', sWeights),
            ('coreCutoff', coreCutoff),
            ('method', method),
            ('confidenceLevel', confidenceLevel),
        )
        for sName, value in namedValues:
            obj.add(sName, value)
    except: raise HF.ReportError (HF.msgInputsErr)
    return obj

def build_value_lists():
    """Read the ID and analysis field of every feature in the output
    feature class.

    Relies on the module-level ``inputs`` and ``properties`` objects set up
    by the main script.

    Returns a 4-tuple:
        sKeys   list of feature IDs, in cursor order
        dcZ     dictionary mapping feature ID -> numeric field value
        dZSum   sum of all field values
        dZ2Sum  sum of all squared field values

    Records whose ID or value cannot be read or converted are skipped; their
    1-based record numbers are collected and reported as a warning.
    """
    sKeys = []
    dcZ = {}
    dZSum = 0.0
    dZ2Sum = 0.0
    sFieldList = properties.sFID + ";" + inputs.sZField
    pRows = HF.pGP.SearchCursor(inputs.sOutputFC,"","",sFieldList)
    pRow = pRows.Next()
    iRec = 0
    lsBadRecs = []

    HF.pGP.AddMessage (HF.sBuildingLists)

    while pRow is not None:
        iRec = iRec + 1
        try:
            sKey = pRow.GetValue(properties.sFID)
            # Round-trip through locale so the value is parsed with the
            # active locale's numeric conventions.
            dTmp = locale.str(pRow.GetValue(inputs.sZField))
            dZ = locale.atof(dTmp)
            dcZ[sKey] = dZ
            sKeys.append(sKey)
            dZSum = dZSum + dZ
            dZ2Sum = dZ2Sum + dZ**2.0
        except Exception:
            # Remember which record failed; previously the list reported in
            # the warning below was never filled and always printed as [].
            lsBadRecs.append(iRec)
        pRow = pRows.Next()

    # Report any problems encountered reading feature input data.
    if lsBadRecs:
        sMessage = HF.msgReadErr % (len(lsBadRecs), iRec)
        HF.pGP.AddWarning (sMessage)
        HF.pGP.AddWarning(repr(lsBadRecs))

    return sKeys, dcZ, dZSum, dZ2Sum

def listGStat(inDict, inList, zSum, z2Sum):
    """Compute the local Getis-Ord G statistic for a region of units.

    Parameters:
    Name        Type                Description
    inDict      Dictionary          The data values in the study area.
    inList      List                The units that make up the region.
    zSum        Float               Sum of the values at all units.
    z2Sum       Float               Sum of the squared values at all units.

    Output:
    Name        Type                Description
    gStat       Float               The local G statistic for the region.
                                    0.0 when the statistic is undefined
                                    (zero deviation, or the region leaves
                                    no units outside it).
    pos         Integer             1 if the numerator of the statistic is
                                    greater than zero, 0 otherwise (always
                                    0 when the statistic is undefined).
    """
    observations = float(len(inDict))
        #observations is the number of total units
    xbar = zSum / observations
        #xbar is the mean of the values at all units
    variance = (z2Sum / observations) - (xbar ** 2)
    # Rounding can push a zero variance slightly below zero, which would
    # make math.sqrt raise; clamp it.
    if variance < 0.0:
        variance = 0.0
    s = math.sqrt(variance)
        #s is the standard deviation of the values at all units

    if len(inList) == 1:
        # For a single unit the statistic reduces to that unit's z-score.
        numerator = inDict[inList[0]] - xbar
        denominator = s
    else:
        sumWeights = float(len(inList))
            #sumWeights is the sum of the spatial weights - assumes binary
            # contiguity
        sumX = 0
        for j in inList:
            sumX += inDict[j]
            #sumX is the sum of the values in the region of analysis
        numerator = sumX - (sumWeights * xbar)
        spread = observations * sumWeights - sumWeights ** 2
        if observations > 1 and spread > 0:
            denominator = s * math.sqrt(spread / (observations - 1))
        else:
            # Region covers the whole study area (or there is only one
            # observation): the statistic is undefined.
            denominator = 0.0

    if denominator == 0:
        # Undefined statistic: report "no clustering" instead of raising
        # ZeroDivisionError in the middle of a cluster search.
        return 0.0, 0

    if numerator > 0:
        pos = 1
    else:
        pos = 0
    gStat = numerator / denominator

    return gStat, pos

def pValForGi(gVal):
    """ Convert a Gi statistic into a two-sided p value using the pdf module.

    Parameters:
    Name        Type                Description
    gVal        Float               A Gi statistic value (treated as a
                                    standard normal variate).

    Output:
    Name        Type                Description
    pVal        Float               Twice the one-tail value obtained from
                                    pdf.zprob (|gVal| below 6) or from
                                    pdf.tpvalue with 3000 degrees of freedom
                                    (otherwise).
    """
    # NOTE(review): presumably pdf.zprob is the lower-tail normal CDF and
    # pdf.tpvalue a one-tail t probability -- confirm against the pdf module.
    if 0 <= gVal < 6.0:
        tail = 1.0 - pdf.zprob(gVal)
    elif -6.0 < gVal < 0:
        tail = pdf.zprob(gVal)
    else:
        # Extreme values on either side are handed to the t distribution.
        tail = pdf.tpvalue(gVal, 3000)
    return 2.0 * tail

def contToIn(inList,w):
    """ Find all units that are contiguous to a list of units.

    Parameters:
    Name        Type                Description
    inList      List                The list of units (must be keys of w).
    w           Dictionary          A sparse binary contiguity matrix: maps
                                    a unit to the list of its neighbours.

    Output:
    Name        Type                Description
    rList       List                Units contiguous to any member of inList
                                    that are not themselves in inList, without
                                    duplicates, in order of first appearance.
    """
    # Track membership in a set: the earlier list.count() scans made this
    # quadratic in the size of the region. Output order is unchanged.
    seen = set(inList)
    rList = []
    for i in inList:
        for j in w[i]:
            if j not in seen:
                seen.add(j)
                rList.append(j)
    return rList

def gAMOEBAForIFast(i,num,w,zSum,z2Sum):
    """ AMOEBA clustering function: grow a cluster from seed unit i.

    Starting from the seed, the units contiguous to the current cluster are
    tried in order of their value (highest first when the seed statistic is
    positive, lowest first otherwise). A unit is added as long as the
    absolute local G statistic (see listGStat) does not decrease and keeps
    its sign indicator; the search moves to the next order of contiguity
    while the statistic keeps increasing.

    Parameters:
    Name        Type                Description
    i           (unit id)           The seed location
    num         Dictionary          The data values
    w           Dictionary          A sparse binary contiguity matrix.
    zSum        Float               Sum of the values at all units.
    z2Sum       Float               Sum of the squared values at all units.

    Output:
    Name        Type                Description
    inList      List                The units in the cluster.
    stat        Float               The local G statistic for this cluster.
    """
    inList = [i]
    lastStat = 0
        #lastStat is the statistic value at the last level of contiguity
    stat,pos0 = listGStat(num,inList,zSum,z2Sum)
        #stat is the statistic value for the current AMOEBA
        #pos0 is an indicator, 1 indicates a cluster of high values.

    # Iterate through orders of contiguity until the clustering statistic
    # no longer increases.
    while abs(stat) > abs(lastStat):
        lastStat = stat
        contList = contToIn(inList,w)
            #contList is a list of units contiguous to the current AMOEBA

        # Order the neighbours by value. Sorting the units themselves keeps
        # every neighbour; the earlier value -> unit dictionary silently
        # dropped all but one of any neighbours that shared a value.
        orderList = sorted(contList, key=num.__getitem__, reverse=(pos0 == 1))

        for j in orderList:
            #add units until the statistic is no longer increased
            aList = inList + [j]
            tempStat,pos1 = listGStat(num,aList,zSum,z2Sum)
            if abs(tempStat) < abs(stat) or pos1 != pos0:
                break
            stat = tempStat
            inList = aList

    # stat always holds the statistic of the current inList.
    return inList,stat

def output_results(mDict,cDict):
    """ Write the results to the output feature class.

    Relies on the module-level ``inputs``, ``properties`` and ``keys``
    objects set up by the main script (``keys`` only sizes the progress
    interval).

    Parameters:
    mDict   Dictionary  feature ID -> statistic written to field "MaxGi".
    cDict   Dictionary  feature ID -> label written to field "Core";
                        features without an entry get "Outside Clusters".

    Returns the name of the statistic field ("MaxGi").

    Errors raised by the cursor while writing are no longer swallowed: the
    earlier bare excepts hid write failures and could relabel a feature as
    "Outside Clusters" because of one.
    """
    sField = "MaxGi"
    cField = "Core"
    # Add results fields if the feature class does not have them yet.
    if sField.upper() not in properties.dcFields:
        HF.pGP.AddField(inputs.sOutputFC, sField, "FLOAT")
    if cField.upper() not in properties.dcFields:
        HF.pGP.AddField(inputs.sOutputFC, cField, "TEXT")

    # Add results to output FC
    HF.pGP.AddMessage (HF.sWritingResults)
    sFieldList = properties.sFID + ";" + sField + ";" + cField
    pRows = HF.pGP.UpdateCursor(inputs.sOutputFC,"","",sFieldList)
    pRow = pRows.Next()
    iCnt = 0
    fInterval = len(keys) / 5.0
    fMore = fInterval
    iComplete = 20
    while pRow is not None:
        iKey = pRow.GetValue(properties.sFID)

        if iKey in mDict:
            # Only a missing (None) result is skipped; a statistic of
            # exactly 0.0 is a valid value and is written.
            if mDict[iKey] is not None:
                pRow.SetValue(sField, mDict[iKey])
            # Progress counts only features that have a result.
            iCnt = iCnt + 1
            if iCnt > fInterval: fInterval, iComplete = HF.check_progress(fInterval, fMore, iComplete)

        pRow.SetValue(cField, cDict.get(iKey) or "Outside Clusters")

        # One update per row, after both fields are set.
        pRows.UpdateRow(pRow)
        pRow = pRows.Next()
    HF.pGP.AddMessage (HF.s100Percent)
    HF.pGP.AddMessage(" ")
    pRows = None
    return sField
#--------------------------------------------------------------------------

#--------------------------------------------------------------------------
#MAIN

def fdrCutoff(gValues, alpha, fnPVal):
    """Find the False Discovery Rate cut-off among a set of Gi values.

    The p values fnPVal(g) are ranked and the largest one satisfying
    p <= rank * alpha / m is selected (m = number of values, rank counted
    from the smallest p value = 1), i.e. the Benjamini-Hochberg step-up
    rule.

    Returns (pCut, gCut): the selected p value and the absolute Gi value it
    belongs to, or (None, None) when no value qualifies.
    """
    ranked = [(fnPVal(g), g) for g in gValues]
    ranked.sort()
    ranked.reverse()            # largest p value first
    iTests = len(ranked)
    alpha = float(alpha)
    for i in range(iTests):
        pVal, gVal = ranked[i]
        # Position i in descending order is rank (iTests - i) in ascending
        # order; the first hit is therefore the largest qualifying p value.
        if pVal <= (iTests - i) * alpha / iTests:
            return pVal, abs(gVal)
    return None, None


def distillCoreClusters(coreDict, clusters, sLabel, fnIsCore):
    """Label the non-overlapping core clusters among candidate clusters.

    clusters is a list of [memberList, statistic] pairs. Clusters whose
    statistic fails fnIsCore are discarded. Of the rest, the cluster with
    the largest absolute statistic is taken (first one wins ties), all of
    its members are labelled sLabel in coreDict, every cluster sharing a
    member with it is discarded, and the selection repeats until no
    candidate is left.

    Returns coreDict (modified in place).
    """
    candidates = [c for c in clusters if fnIsCore(c[1])]
    while candidates:
        best = candidates[0]
        for c in candidates[1:]:
            if abs(c[1]) > abs(best[1]):
                best = c
        members = set(best[0])
        for k in best[0]:
            coreDict[k] = sLabel
        candidates = [c for c in candidates
                      if c is not best and not members.intersection(c[0])]
    return coreDict


if __name__ == "__main__":
    # Runs the AMOEBA clustering tool: copies the input feature class,
    # finds an AMOEBA cluster for every seed unit, distils the
    # non-overlapping core clusters and writes the results.

    #Get input arguments, construct an "inputs" object
    inputs = get_inputs()

    # Resolve the adjustment method up front so that an unrecognised value
    # fails before the existing output is deleted (it used to surface later
    # as a NameError on 'method').
    sMethod = inputs.method
    dcMethods = {"Cutoff adjustment": "1",
                 "FDR adjustment": "2",
                 "Bonferroni adjustment": "3"}
    if sMethod not in dcMethods:
        raise HF.ReportError ("Unknown adjustment method: %s" % sMethod)
    method = dcMethods[sMethod]

    iNumRecs = HF.pGP.GetCount(inputs.sInputFC)
    if iNumRecs < 30:
        # The message was referenced without the HF prefix, which raised a
        # NameError. NOTE(review): assumes HelperFunctions defines
        # msgFewRecsWrn like its other msg* constants -- confirm.
        HF.pGP.AddWarning (HF.msgFewRecsWrn)

    if HF.pGP.exists(inputs.sOutputFC):
        HF.pGP.delete(inputs.sOutputFC)

    #Copy the input feature class to the output feature class.
    try:
        HF.pGP.QualifiedFieldNames = 0
        HF.pGP.Copyfeatures(inputs.sInputFC, inputs.sOutputFC)
    except:
        sMessage = HF.msgOverwriteErr % (inputs.sOutputFC)
        raise HF.ReportError (sMessage)

    #Match input fc field names to output fc field names... these could
    # change if the input fc has a join
    pInFields = HF.pGP.ListFields(inputs.sInputFC)
    pOutFields = HF.pGP.ListFields(inputs.sOutputFC)
    pInField = pInFields.next()
    pOutField = pOutFields.next()
    while pInField:
        sName = (pInField.Name).upper()
        if sName == inputs.sZField:
            inputs.sZField = (pOutField.Name).upper()
        pInField = pInFields.next()
        pOutField = pOutFields.next()

    #Get and check feature class information: fields, shapefile, etc.
    properties = HF.get_featureclass_properties(inputs.sOutputFC)

    HF.check_field_properties(inputs, properties)
    HF.check_numeric_properties(inputs)

    # Get the keys, the Z values and their sums.
    keys,zDict,zSumG,z2SumG = build_value_lists()

    #Read GAL File
    w = GR.galReader(inputs.bWtsFile,firstIndex=1)

    if len(keys) != len(w):
        HF.pGP.AddWarning("Error: Length of data vector and weights matrix do not match")

    # Calculate the AMOEBA Clusters.
    clusterHighList = []    # [members, statistic] for clusters with stat > 0
    clusterLowList = []     # [members, statistic] for all other clusters
    maxDict = {}            # unit -> statistic of largest magnitude seen
    iCnt = 1
    fInterval = len(keys) / 5.0
    fMore = fInterval
    iComplete = 20
    HF.pGP.AddWarning("Finding AMOEBA Clusters with "+sMethod+"...")
    for i in keys:
        aList,aStat = gAMOEBAForIFast(i,zDict,w,zSumG,z2SumG)

        if aStat > 0:
            clusterHighList.append([aList,aStat])
        else:
            clusterLowList.append([aList,aStat])

        for j in aList:
            if j not in maxDict or abs(aStat) > abs(maxDict[j]):
                maxDict[j] = aStat

        iCnt += 1
        if iCnt > fInterval:
            fInterval, iComplete = HF.check_progress(fInterval, fMore, iComplete)

    # Distil the core clusters. coreDict holds the label of every unit
    # that belongs to a core cluster; high clusters are assigned first, so
    # a unit claimed by both ends up labelled "Low Cluster" as before.
    # The strongest cluster of each kind is now picked inside
    # distillCoreClusters; this replaces the max/min tracking that used an
    # 'elif' (missing minima), the misspelt 'minCLuster' initialiser and
    # the low-cluster loop that tested tempMaxStat instead of tempMinStat.
    coreDict = {}
    if method == "1":
        #User Specified Cutoff value
        dCutoff = float(inputs.coreCutoff)
        HF.pGP.AddWarning("distilling core clusters...")
        distillCoreClusters(coreDict, clusterHighList, "High Cluster",
                            lambda g: g > dCutoff)
        distillCoreClusters(coreDict, clusterLowList, "Low Cluster",
                            lambda g: g < -dCutoff)
    else:
        alpha = float(inputs.confidenceLevel)
        if method == "2":
            HF.pGP.AddWarning("Finding FDR...")
            # Use the statistics actually computed rather than indexing
            # maxDict by 0..iNumRecs-1, which assumed consecutive zero-based
            # IDs; ranks follow the step-up rule (see fdrCutoff).
            pCut, gFDR = fdrCutoff(maxDict.values(), alpha, pValForGi)
            if pCut is None:
                HF.pGP.AddWarning("FDR: no Gi* value is significant at the requested level")
            else:
                HF.pGP.AddWarning("FDR Cutoff Gi*: %f" %gFDR)
        else:
            pCut = alpha/iNumRecs
            HF.pGP.AddWarning("Bonferroni adjusted p-vaule: %f" %pCut)

        HF.pGP.AddWarning("distilling core clusters...")
        if pCut is not None:
            # '<=' so that the cluster sitting exactly on the cut-off is
            # kept; the sign is fixed by which list a cluster is in, since
            # pValForGi is two-sided.
            isSignificant = lambda g: pValForGi(g) <= pCut
            distillCoreClusters(coreDict, clusterHighList, "High Cluster",
                                isSignificant)
            distillCoreClusters(coreDict, clusterLowList, "Low Cluster",
                                isSignificant)

    # Create output feature class with results field
    sField = output_results(maxDict,coreDict)

    #Wrap it up.
    try:
        # NOTE(review): index 5 is also read as the confidence level in
        # get_inputs; confirm the derived-output index in the toolbox.
        HF.pGP.SetParameterAsText (5,sField)  # This may trigger a pythonwin error,
                                              # but will run okay outside pythonwin
    except:
        HF.pGP.AddWarning(HF.msgDerivedOutputErr)
    pGP = None
#==========================================================================
