#!/usr/bin/env python import sys import json import httplib import urllib import libsvm_formatter from optparse import OptionParser solrQueryUrl = "" def setupSolr(collection, host, port, featuresFile, featureStoreName): '''Sets up solr with the proper features for the test''' conn = httplib.HTTPConnection(host, port) baseUrl = "/solr/" + collection featureUrl = baseUrl + "/schema/feature-store" conn.request("DELETE", featureUrl+"/"+featureStoreName) r = conn.getresponse() msg = r.read() if (r.status != httplib.OK and r.status != httplib.CREATED and r.status != httplib.ACCEPTED and r.status != httplib.NOT_FOUND): raise Exception("Status: {0} {1}\nResponse: {2}".format(r.status, r.reason, msg)) # Add features headers = {'Content-type': 'application/json'} featuresBody = open(featuresFile) conn.request("POST", featureUrl, featuresBody, headers) r = conn.getresponse() msg = r.read() if (r.status != httplib.OK and r.status != httplib.ACCEPTED): print r.status print "" print r.reason; raise Exception("Status: {0} {1}\nResponse: {2}".format(r.status, r.reason, msg)) conn.close() def generateQueries(userQueriesFile, collection, requestHandler, solrFeatureStoreName, efiParams): with open(userQueriesFile) as input: solrQueryUrls = [] #A list of tuples with solrQueryUrl,solrQuery,docId,scoreForPQ,source for line in input: line = line.strip(); searchText,docId,score,source = line.split("|"); solrQuery = generateHttpRequest(collection,requestHandler,solrFeatureStoreName,efiParams,searchText,docId) solrQueryUrls.append((solrQuery,searchText,docId,score,source)) return solrQueryUrls; def generateHttpRequest(collection, requestHandler, solrFeatureStoreName, efiParams, searchText, docId): global solrQueryUrl if len(solrQueryUrl) < 1: solrQueryUrl = "/".join([ "", "solr", collection, requestHandler ]) solrQueryUrl += ("?fl=" + ",".join([ "id", "score", "[features store="+solrFeatureStoreName+" "+efiParams+"]" ])) solrQueryUrl += "&q=" solrQueryUrl = solrQueryUrl.replace(" ","+") solrQueryUrl += urllib.quote_plus("id:") userQuery = urllib.quote_plus(searchText.strip().replace("'","\\'").replace("/","\\\\/")) solrQuery = solrQueryUrl + '"' + urllib.quote_plus(docId) + '"' #+ solrQueryUrlEnd solrQuery = solrQuery.replace("%24USERQUERY", userQuery).replace('$USERQUERY', urllib.quote_plus("\\'" + userQuery + "\\'")) return solrQuery def generateTrainingData(solrQueries, host, port): '''Given a list of solr queries, yields a tuple of query , docId , score , source , feature vector for each query. Feature Vector is a list of strings of form "key=value"''' conn = httplib.HTTPConnection(host, port) headers = {"Connection":" keep-alive"} try: for queryUrl,query,docId,score,source in solrQueries: conn.request("GET", queryUrl, headers=headers) r = conn.getresponse() msg = r.read() msgDict = json.loads(msg) fv = "" docs = msgDict['response']['docs'] if len(docs) > 0 and "[features]" in docs[0]: if not msgDict['response']['docs'][0]["[features]"] == None: fv = msgDict['response']['docs'][0]["[features]"]; else: print "ERROR NULL FV FOR: " + docId; print msg continue; else: print "ERROR FOR: " + docId; print msg continue; if r.status == httplib.OK: #print "http connection was ok for: " + queryUrl yield(query,docId,score,source,fv.split(",")); else: raise Exception("Status: {0} {1}\nResponse: {2}".format(r.status, r.reason, msg)) except Exception as e: print msg print e conn.close() def uploadModel(collection, host, port, modelFile, modelName): modelUrl = "/solr/" + collection + "/schema/model-store" headers = {'Content-type': 'application/json'} with open(modelFile) as modelBody: conn = httplib.HTTPConnection(host, port) conn.request("DELETE", modelUrl+"/"+modelName) r = conn.getresponse() msg = r.read() if (r.status != httplib.OK and r.status != httplib.CREATED and r.status != httplib.ACCEPTED and r.status != httplib.NOT_FOUND): raise Exception("Status: {0} {1}\nResponse: {2}".format(r.status, r.reason, msg)) conn.request("POST", modelUrl, modelBody, headers) r = conn.getresponse() msg = r.read() if (r.status != httplib.OK and r.status != httplib.CREATED and r.status != httplib.ACCEPTED): raise Exception("Status: {0} {1}\nResponse: {2}".format(r.status, r.reason, msg)) def main(argv=None): if argv is None: argv = sys.argv parser = OptionParser(usage="usage: %prog [options] ", version="%prog 1.0") parser.add_option('-c', '--config', dest='configFile', help='File of configuration for the test') (options, args) = parser.parse_args() if options.configFile == None: parser.print_help() return 1 with open(options.configFile) as configFile: config = json.load(configFile) print "Uploading features ("+config["solrFeaturesFile"]+") to Solr" setupSolr(config["collection"], config["host"], config["port"], config["solrFeaturesFile"], config["solrFeatureStoreName"]) print "Converting user queries ("+config["userQueriesFile"]+") into Solr queries for feature extraction" reRankQueries = generateQueries(config["userQueriesFile"], config["collection"], config["requestHandler"], config["solrFeatureStoreName"], config["efiParams"]) print "Running Solr queries to extract features" fvGenerator = generateTrainingData(reRankQueries, config["host"], config["port"]) formatter = libsvm_formatter.LibSvmFormatter(); formatter.processQueryDocFeatureVector(fvGenerator,config["trainingFile"]); print "Training model using '"+config["trainingLibraryLocation"]+" "+config["trainingLibraryOptions"]+"'" libsvm_formatter.trainLibSvm(config["trainingLibraryLocation"],config["trainingLibraryOptions"],config["trainingFile"],config["trainedModelFile"]) print "Converting trained model ("+config["trainedModelFile"]+") to solr model ("+config["solrModelFile"]+")" formatter.convertLibSvmModelToLtrModel(config["trainedModelFile"], config["solrModelFile"], config["solrModelName"], config["solrFeatureStoreName"]) print "Uploading model ("+config["solrModelFile"]+") to Solr" uploadModel(config["collection"], config["host"], config["port"], config["solrModelFile"], config["solrModelName"]) if __name__ == '__main__': sys.exit(main())