mirror of https://github.com/apache/lucene.git
127 lines
6.0 KiB
Python
127 lines
6.0 KiB
Python
from subprocess import call
|
|
import os
|
|
|
|
# Minimum relevance gap between two documents of the same query for the pair
# to be written as a training example; pairs whose relevance differs by no
# more than this are skipped as carrying no ranking signal.
PAIRWISE_THRESHOLD = 1.e-1
|
|
# Feature differences whose absolute value does not exceed this are dropped
# when two feature vectors are subtracted, keeping the output sparse.
FEATURE_DIFF_THRESHOLD = 1.e-6
|
|
|
|
class LibSvmFormatter:
    '''Converts per-query document feature vectors into pairwise libSVM training
    data, and converts a trained linear libSVM model into a Solr LTR model file.

    The feature name <-> integer id mappings built while writing the training
    file are reused when converting the trained model, so both steps must be
    run on the same instance.'''

    def __init__(self):
        # Start with empty mappings so the helper methods are usable even
        # before processQueryDocFeatureVector has been called.
        self.featureNameToId = {}
        self.featureIdToName = {}
        self.curFeatIndex = 1

    def processQueryDocFeatureVector(self, docClickInfo, trainingFile):
        '''Writes the pairwise training file for the given documents.

        Expects as input a sorted by queries list or generator that provides the context
        for each query in a tuple composed of: (query , docId , relevance , source , featureVector).
        The list of documents that are part of the same query will generate comparisons
        against each other for training.

        Calling this resets the feature name/id mappings of this instance.'''
        with open(trainingFile, "w") as output:
            self.featureNameToId = {}
            self.featureIdToName = {}
            self.curFeatIndex = 1
            curListOfFv = []
            # Compare (query, source) as a tuple: concatenating the two strings
            # would make e.g. ("ab", "c") and ("a", "bc") look like one group.
            curQueryAndSource = None
            for query, docId, relevance, source, featureVector in docClickInfo:
                queryAndSource = (query, source)
                if curQueryAndSource != queryAndSource:
                    # Time to flush out all the pairs of the previous group
                    _writeRankSVMPairs(curListOfFv, output)
                    curListOfFv = []
                    curQueryAndSource = queryAndSource
                curListOfFv.append((relevance, self._makeFeaturesMap(featureVector)))
            # This catches the last list of comparisons
            _writeRankSVMPairs(curListOfFv, output)

    def _makeFeaturesMap(self, featureVector):
        '''Expects a list of "featureName=featureValue" strings. Outputs a map of map[key] = value
        where key is now an integer and value a float. libSVM requires the key to be an
        integer but not all libraries have this requirement.

        Raises ValueError when an entry has no "=" or its value is not a number.'''
        features = {}
        for keyValuePairStr in featureVector:
            # Split on the LAST "=": the value is a plain number, so any other
            # "=" must belong to the feature name.
            featName, separator, featValue = keyValuePairStr.rpartition("=")
            if not separator:
                raise ValueError("Expected 'name=value' feature but got: " + repr(keyValuePairStr))
            features[self._getFeatureId(featName)] = float(featValue)
        return features

    def _getFeatureId(self, key):
        '''Returns the integer id of the feature name, assigning the next free id
        (starting at 1) the first time a name is seen.'''
        if key not in self.featureNameToId:
            self.featureNameToId[key] = self.curFeatIndex
            self.featureIdToName[self.curFeatIndex] = key
            self.curFeatIndex += 1
        return self.featureNameToId[key]

    def convertLibSvmModelToLtrModel(self, libSvmModelLocation, outputFile, modelName, featureStoreName):
        '''Writes a Solr LTR LinearModel JSON file from a trained libSVM model file.

        Every non-blank line after the line containing only "w" in the model file
        is read as the weight of the next feature id (1, 2, 3, ...), which is
        mapped back to its name through the ids assigned by this instance.

        Raises KeyError when the model holds more weights than known features and
        ValueError when a weight line is not a number.'''
        import json

        def quote(text):
            # json.dumps adds the surrounding quotes and escapes embedded quotes,
            # backslashes and control characters; plain names come out unchanged.
            return json.dumps(str(text), ensure_ascii=False)

        # Parse the whole model before touching the output file, so a malformed
        # model does not leave a truncated JSON file behind.
        weightEntries = []
        with open(libSvmModelLocation, 'r') as inFile:
            startReading = False
            counter = 1
            for line in inFile:
                content = line.strip()
                if not startReading:
                    startReading = content == 'w'
                    continue
                if not content:
                    # tolerate blank lines (e.g. at the end of the file)
                    continue
                weightEntries.append('\t\t\t' + quote(self.featureIdToName[counter]) + ':' + str(float(content)))
                counter += 1

        # List the features in id order, i.e. the order their weights appear in.
        featureEntries = [
            '\t\t{ "name":' + quote(self.featureIdToName[featId]) + '}'
            for featId in sorted(self.featureIdToName)
        ]

        # TODO: build the whole document with the json module; the layout is
        # still written by hand to keep the established file format.
        with open(outputFile, 'w') as convertedOutFile:
            convertedOutFile.write('{\n\t"class":"org.apache.solr.ltr.model.LinearModel",\n')
            convertedOutFile.write('\t"store": ' + quote(featureStoreName) + ',\n')
            convertedOutFile.write('\t"name": ' + quote(modelName) + ',\n')
            convertedOutFile.write('\t"features": [\n')
            convertedOutFile.write(',\n'.join(featureEntries))
            convertedOutFile.write("\n\t],\n")
            convertedOutFile.write('\t"params": {\n\t\t"weights": {\n')
            convertedOutFile.write(',\n'.join(weightEntries))
            convertedOutFile.write('\n\t\t}\n\t}\n}')
|
|
|
|
def _writeRankSVMPairs(listOfFeatures, output):
    '''Given a list of (relevance, {Features Map}) where the list represents
    a set of documents to be compared, this calculates all pairs and
    writes the Feature Vectors in a format compatible with libSVM.

    For every pair whose relevance differs by more than PAIRWISE_THRESHOLD two
    lines are written: a "+1" line holding (more relevant - less relevant) and
    a "-1" line holding the opposite difference. Pairs of approximately equal
    relevance are a useless signal and are skipped.

    Ex: listOfFeatures = [
      #(relevance, {feature1:value, featureN:value})
      (4, {1:0.9, 2:0.9, 3:0.1}),
      (3, {1:0.7, 2:0.9, 3:0.2}),
      (1, {1:0.1, 2:0.9, 6:0.1}),
    ]
    '''
    from itertools import combinations

    # Parse every relevance once up front instead of once per pair.
    docs = [(float(doc[0]), doc[1]) for doc in listOfFeatures]

    # combinations() yields (0,1), (0,2), ..., (1,2), ... i.e. each unordered
    # pair exactly once, in the same order as two nested index loops.
    for (relevance1, fv1), (relevance2, fv2) in combinations(docs, 2):
        relevanceGap = relevance1 - relevance2
        if relevanceGap > PAIRWISE_THRESHOLD:
            better, worse = fv1, fv2
        elif relevanceGap < -PAIRWISE_THRESHOLD:
            better, worse = fv2, fv1
        else:
            continue
        outputLibSvmLine("+1", subtractFvMap(better, worse), output)
        outputLibSvmLine("-1", subtractFvMap(worse, better), output)
|
|
|
|
def subtractFvMap(fv1, fv2, threshold=None):
    '''Returns the sparse feature vector fv1 - fv2 as a new map; the inputs are
    not modified.

    A feature missing from one of the maps counts as 0 there. Every resulting
    entry whose absolute value does not exceed the threshold is left out, so
    the result is always in sparse format and carries no useless signals.

    threshold defaults to the module level FEATURE_DIFF_THRESHOLD.'''
    if threshold is None:
        threshold = FEATURE_DIFF_THRESHOLD

    retFv = {}
    # Features of fv1 first (keeping their order), then those only in fv2.
    for featInd, value in fv1.items():
        subVal = value - fv2[featInd] if featInd in fv2 else value
        # The threshold also applies to features that only fv1 has, otherwise
        # explicit zeros of fv1 would leak into the sparse output.
        if abs(subVal) > threshold:
            retFv[featInd] = subVal
    for featInd, value in fv2.items():
        if featInd not in fv1:
            subVal = -value
            if abs(subVal) > threshold:
                retFv[featInd] = subVal
    return retFv
|
|
|
|
def outputLibSvmLine(sign, fvMap, outputFile):
    '''Writes one libSVM formatted line: the label (sign) followed by a
    " index:value" entry for every feature in fvMap and a newline.

    The entries are written in ascending index order, as the libSVM data
    format requires, regardless of the order of the keys in fvMap.'''
    entries = "".join(" " + str(feat) + ":" + str(fvMap[feat]) for feat in sorted(fvMap))
    outputFile.write(sign + entries + "\n")
|
|
|
|
def trainLibSvm(libraryLocation, libraryOptions, trainingFileName, trainedModelFileName):
    '''Runs the training executable at libraryLocation as
    "<library> <options...> <training file> <model file>" and waits for it.

    libraryOptions may be a single string holding one or more shell style
    options (e.g. "-s 6 -c 1"), a list of already separated options, or None
    for no options at all.

    Returns the exit status of the executable.
    Raises Exception when libraryLocation is not an existing file.'''
    import shlex

    if not os.path.isfile(libraryLocation):
        raise Exception("NO LIBRARY FOUND: " + libraryLocation)

    if libraryOptions is None:
        options = []
    elif isinstance(libraryOptions, str):
        # Each option must be its own argv entry; handing over "-s 6 -c 1"
        # as one argument would not be understood by the executable.
        options = shlex.split(libraryOptions)
    else:
        options = list(libraryOptions)

    return call([libraryLocation] + options + [trainingFileName, trainedModelFileName])
|