# lucene/solr/contrib/ltr/example/libsvm_formatter.py

import json
import os
from subprocess import call

# Minimum relevance gap between two documents of the same query for the pair
# to be emitted as a training comparison; closer pairs are skipped.
PAIRWISE_THRESHOLD = 0.1
# Feature differences whose magnitude does not exceed this value are treated
# as zero and left out of the sparse difference vector.
FEATURE_DIFF_THRESHOLD = 0.000001

class LibSvmFormatter:
    '''Builds pairwise libSVM training files from per-query feature vectors and
    converts a trained linear model into a Solr LTR LinearModel JSON file.

    The instance keeps the mapping between feature names and the integer ids
    written to the training file, so the same instance that produced the
    training file must be used to convert the resulting model.'''

    def __init__(self):
        # Start with empty mappings so the helper methods are usable even before
        # processQueryDocFeatureVector has been called.
        self.featureNameToId = {}
        self.featureIdToName = {}
        self.curFeatIndex = 1

    def processQueryDocFeatureVector(self,docClickInfo,trainingFile):
        '''Expects as input a sorted by queries list or generator that provides the context
        for each query in a tuple composed of: (query , docId , relevance , source , featureVector).
        The list of documents that are part of the same query will generate comparisons
        against each other for training.

        docId is unpacked but not used. The feature name/id mappings are reset on
        every call and the pairwise comparisons are written to trainingFile.'''
        with open(trainingFile,"w") as output:
            self.featureNameToId = {}
            self.featureIdToName = {}
            self.curFeatIndex = 1
            curListOfFv = []
            # A (query, source) tuple is used as the group key: concatenating the
            # two strings would make ("ab", "c") and ("a", "bc") look identical.
            curQueryAndSource = None
            for query,docId,relevance,source,featureVector in docClickInfo:
                queryAndSource = (query, source)
                if curQueryAndSource != queryAndSource:
                    #Time to flush out all the pairs
                    _writeRankSVMPairs(curListOfFv,output)
                    curListOfFv = []
                    curQueryAndSource = queryAndSource
                curListOfFv.append((relevance,self._makeFeaturesMap(featureVector)))
            _writeRankSVMPairs(curListOfFv,output) #This catches the last list of comparisons

    def _makeFeaturesMap(self,featureVector):
        '''Expects a list of "featureName=featureValue" strings. Outputs a map of map[key] = value
        where key is the integer id assigned to the feature name and value is a float.
        libSVM requires the key to be an integer but not all libraries have this requirement.

        Raises ValueError when an entry has no "=" or its value is not a number.'''
        features = {}
        for keyValuePairStr in featureVector:
            # Split on the LAST "=" so that feature names containing "=" still parse;
            # the value is a float literal and can never contain one.
            featName,featValue = keyValuePairStr.rsplit("=", 1)
            features[self._getFeatureId(featName)] = float(featValue)
        return features

    def _getFeatureId(self,key):
        '''Returns the integer id of feature name key, assigning the next free id
        (starting at 1) the first time a name is seen.'''
        if key not in self.featureNameToId:
            self.featureNameToId[key] = self.curFeatIndex
            self.featureIdToName[self.curFeatIndex] = key
            self.curFeatIndex += 1
        return self.featureNameToId[key]

    def convertLibSvmModelToLtrModel(self,libSvmModelLocation,outputFile,modelName,featureStoreName):
        '''Reads the weights of a trained model file and writes them to outputFile as a
        JSON document with "class", "store", "name", "features" and "params.weights".

        Every non-blank line after the line containing only "w" is read as one weight;
        the n-th weight belongs to the feature with id n. The model is parsed completely
        before outputFile is opened, so a malformed model does not leave a partial file.

        Raises ValueError for a weight that is not a number and KeyError when the model
        holds more weights than there are known features.'''
        weights = {}
        startReading = False
        counter = 1
        with open(libSvmModelLocation, 'r') as inFile:
            for line in inFile:
                stripped = line.strip()
                if startReading:
                    if not stripped:
                        continue # tolerate blank/trailing lines in the weight section
                    weights[self.featureIdToName[counter]] = float(stripped)
                    counter += 1
                elif stripped == 'w':
                    startReading = True

        ltrModel = {
            "class": "org.apache.solr.ltr.model.LinearModel",
            "store": str(featureStoreName),
            "name": str(modelName),
            # List the features in id order, i.e. the order they were first seen.
            "features": [{"name": self.featureIdToName[featId]}
                         for featId in sorted(self.featureIdToName)],
            "params": {"weights": weights},
        }
        with open(outputFile,'w') as convertedOutFile:
            # json takes care of quoting/escaping names that contain quotes,
            # backslashes or non-ASCII characters.
            json.dump(ltrModel, convertedOutFile, indent=2)

def _writeRankSVMPairs(listOfFeatures,output):
    '''Writes the pairwise training lines for one group of documents.

    listOfFeatures is a list of (relevance, {Features Map}) entries that belong
    to the same comparison group, e.g.
      [
        #(relevance, {feature1:value, featureN:value})
        (4, {1:0.9, 2:0.9, 3:0.1}),
        (3, {1:0.7, 2:0.9, 3:0.2}),
        (1, {1:0.1, 2:0.9, 6:0.1}),
      ]
    Every unordered pair is visited once. When the relevance gap exceeds
    PAIRWISE_THRESHOLD two lines are written: "+1" with (better - worse) and
    "-1" with (worse - better). Pairs with roughly equal relevance carry no
    signal and produce no output.
    '''
    for position, first in enumerate(listOfFeatures):
        for second in listOfFeatures[position + 1:]:
            firstFv, secondFv = first[1], second[1]
            relevanceGap = float(first[0]) - float(second[0])
            if relevanceGap > PAIRWISE_THRESHOLD:
                # first is the more relevant document
                better, worse = firstFv, secondFv
            elif relevanceGap < -PAIRWISE_THRESHOLD:
                # second is the more relevant document
                better, worse = secondFv, firstFv
            else:
                # approximately equal relevance: nothing to learn from this pair
                continue
            outputLibSvmLine("+1", subtractFvMap(better, worse), output)
            outputLibSvmLine("-1", subtractFvMap(worse, better), output)

def subtractFvMap(fv1,fv2,threshold=None):
    '''Returns the sparse feature vector fv1 - fv2 as a new dict.

    A feature missing from one of the maps counts as 0.0 there. Every
    difference whose magnitude does not exceed threshold is dropped, including
    features that only appear in fv1, so the result never carries near-zero
    entries. Neither input map is modified.

    threshold defaults to the module-level FEATURE_DIFF_THRESHOLD.
    Keys keep the order of fv1, followed by the keys found only in fv2.'''
    if threshold is None:
        threshold = FEATURE_DIFF_THRESHOLD
    retFv = {}
    for featInd, value in fv1.items():
        subVal = value - fv2[featInd] if featInd in fv2 else value
        if abs(subVal) > threshold:
            retFv[featInd] = subVal
    for featInd, value in fv2.items():
        # Features absent from fv1 contribute 0 - value.
        if featInd not in fv1 and abs(value) > threshold:
            retFv[featInd] = -value
    return retFv

def outputLibSvmLine(sign,fvMap,outputFile):
    '''Writes one libSVM-formatted line: the label followed by " index:value"
    for every entry of fvMap, terminated by a newline.

    sign is the label text (e.g. "+1" or "-1"), fvMap maps feature index to
    value and outputFile is any object with a write() method. The entries are
    written in ascending index order, as the libSVM data format requires; the
    order of the dict itself is not guaranteed to be ascending.'''
    fields = [sign]
    fields.extend(str(feat) + ":" + str(fvMap[feat]) for feat in sorted(fvMap))
    outputFile.write(" ".join(fields) + "\n")

def trainLibSvm(libraryLocation,libraryOptions,trainingFileName,trainedModelFileName):
    '''Runs the training executable at libraryLocation as
    "<library> <options...> <trainingFileName> <trainedModelFileName>".

    libraryOptions may be a whitespace separated option string (e.g. "-q" or
    "-s 3 -c 0.1"), a sequence of already separated options, or empty/None for
    no options. Each option is passed as its own argument; handing a whole
    multi-flag (or empty) string over as a single argument would be
    misinterpreted by the executable.

    Returns the exit code of the process.
    Raises Exception when libraryLocation is not an existing file.'''
    if not os.path.isfile(libraryLocation):
        raise Exception("NO LIBRARY FOUND: " + libraryLocation)
    try:
        options = libraryOptions.split()
    except AttributeError:
        # Not a string: accept a ready-made sequence of options, or None.
        options = list(libraryOptions or [])
    return call([libraryLocation] + options + [trainingFileName, trainedModelFileName])