SOLR-2827: RegexpBoost Update Processor

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1440940 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jan Høydahl 2013-01-31 12:58:40 +00:00
parent 398b80b6de
commit 727739fc1f
5 changed files with 390 additions and 0 deletions

View File

@ -66,6 +66,8 @@ New Features
* SOLR-4043: Add ability to get success/failure responses from Collections API. * SOLR-4043: Add ability to get success/failure responses from Collections API.
(Raintung Li, Mark Miller) (Raintung Li, Mark Miller)
* SOLR-2827: RegexpBoost Update Processor (janhoy)
Bug Fixes Bug Fixes
---------------------- ----------------------

View File

@ -0,0 +1,211 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A processor which will match content of "inputField" against regular expressions
* found in "boostFilename", and if it matches will return the corresponding boost
* value from the file and output this to "boostField" as a double value.
* If more than one pattern matches, the boosts from each are multiplied.
* <p>
* A typical use case may be to match a URL against patterns to boost or deboost
* web documents based on the URL itself:
* <pre>
* # Format of each line: &lt;pattern&gt;&lt;TAB&gt;&lt;boost&gt;
* # Example:
* https?://my.domain.com/temp.* 0.2
* </pre>
* <p>
* Both inputField, boostField and boostFilename are mandatory parameters.
*/
public class RegexpBoostProcessor extends UpdateRequestProcessor {
protected static final String INPUT_FIELD_PARAM = "inputField";
protected static final String BOOST_FIELD_PARAM = "boostField";
protected static final String BOOST_FILENAME_PARAM = "boostFilename";
private static final String DEFAULT_INPUT_FIELDNAME = "url";
private static final String DEFAULT_BOOST_FIELDNAME = "urlboost";
private static final Logger log = LoggerFactory.getLogger(RegexpBoostProcessor.class);
private boolean enabled = true;
private String inputFieldname = DEFAULT_INPUT_FIELDNAME;
private String boostFieldname = DEFAULT_BOOST_FIELDNAME;
private String boostFilename;
private List<BoostEntry> boostEntries = new ArrayList<BoostEntry>();
private static final String BOOST_ENTRIES_CACHE_KEY = "boost-entries";
RegexpBoostProcessor(SolrParams parameters,
SolrQueryRequest request,
SolrQueryResponse response,
UpdateRequestProcessor nextProcessor,
final Map<Object, Object> sharedObjectCache) {
super(nextProcessor);
this.initParameters(parameters);
if (this.boostFilename == null) {
log.warn("Null boost filename. Disabling processor.");
setEnabled(false);
}
if (!isEnabled()) {
return;
}
try {
synchronized (sharedObjectCache) {
List<BoostEntry> cachedBoostEntries =
(List<BoostEntry>) sharedObjectCache.get(BOOST_ENTRIES_CACHE_KEY);
if (cachedBoostEntries == null) {
log.debug("No pre-cached boost entry list found, initializing new");
InputStream is = request.getCore().getResourceLoader().openResource(boostFilename);
cachedBoostEntries = initBoostEntries(is);
sharedObjectCache.put(BOOST_ENTRIES_CACHE_KEY, cachedBoostEntries);
} else {
if (log.isDebugEnabled()) {
log.debug("Using cached boost entry list with " + cachedBoostEntries.size() + " elements.");
}
}
this.boostEntries = cachedBoostEntries;
}
} catch (IOException ioe) {
log.warn("IOException while initializing boost entries from file " + this.boostFilename, ioe);
}
}
private void initParameters(SolrParams parameters) {
if (parameters != null) {
this.setEnabled(parameters.getBool("enabled", true));
this.inputFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_INPUT_FIELDNAME);
this.boostFieldname = parameters.get(BOOST_FIELD_PARAM, DEFAULT_BOOST_FIELDNAME);
this.boostFilename = parameters.get(BOOST_FILENAME_PARAM);
}
}
private List<BoostEntry> initBoostEntries(InputStream is) throws IOException {
List<BoostEntry> newBoostEntries = new ArrayList<BoostEntry>();
BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
try {
String line = null;
while ((line = reader.readLine()) != null) {
// Remove comments
line = line.replaceAll("\\s+#.*$", "");
line = line.replaceAll("^#.*$", "");
// Skip empty lines or comment lines
if (line.trim().length() == 0) {
continue;
}
String[] fields = line.split("\\s+");
if (fields.length == 2) {
String regexp = fields[0];
String boost = fields[1];
newBoostEntries.add(new BoostEntry(Pattern.compile(regexp), Double.parseDouble(boost)));
log.debug("Read regexp " + regexp + " with boost " + boost);
} else {
log.warn("Malformed config input line: " + line + " (expected 2 fields, got " + fields.length + " fields). Skipping entry.");
continue;
}
}
} finally {
IOUtils.closeQuietly(reader);
}
return newBoostEntries;
}
@Override
public void processAdd(AddUpdateCommand command) throws IOException {
if (isEnabled()) {
processBoost(command);
}
super.processAdd(command);
}
public void processBoost(AddUpdateCommand command) {
SolrInputDocument document = command.getSolrInputDocument();
if (document.containsKey(inputFieldname)) {
String value = (String) document.getFieldValue(inputFieldname);
double boost = 1.0f;
for (BoostEntry boostEntry : boostEntries) {
if (boostEntry.getPattern().matcher(value).matches()) {
if (log.isDebugEnabled()) {
log.debug("Pattern match " + boostEntry.getPattern().pattern() + " for " + value);
}
boost = (boostEntry.getBoost() * 1000) * (boost * 1000) / 1000000;
}
}
document.setField(boostFieldname, boost);
if (log.isDebugEnabled()) {
log.debug("Value " + boost + ", applied to field " + boostFieldname);
}
}
}
public boolean isEnabled() {
return enabled;
}
public void setEnabled(boolean enabled) {
this.enabled = enabled;
}
private static class BoostEntry {
private Pattern pattern;
private double boost;
public BoostEntry(Pattern pattern, double d) {
this.pattern = pattern;
this.boost = d;
}
public Pattern getPattern() {
return pattern;
}
public double getBoost() {
return boost;
}
}
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
/**
* Factory which creates RegexBoostProcessors
* <p>
* The factory initializes a shared object cache which is passed to the processor
* and this way reduces rules file parsing to the first time the UpdateChain
* is initialized.
*/
public class RegexpBoostProcessorFactory extends UpdateRequestProcessorFactory {
private SolrParams params;
private final Map<Object, Object> sharedObjectCache = new HashMap<Object, Object>();
@Override
public void init(@SuppressWarnings("rawtypes") final NamedList args) {
if (args != null) {
this.params = SolrParams.toSolrParams(args);
}
}
@Override
public UpdateRequestProcessor getInstance(SolrQueryRequest request,
SolrQueryResponse response,
UpdateRequestProcessor nextProcessor) {
return new RegexpBoostProcessor(this.params, request, response, nextProcessor, this.sharedObjectCache);
}
}

View File

@ -0,0 +1,10 @@
# Sample config file for RegexBoostProcessor
# This example applies boost on the "url" field to boost or deboost certain urls
# All rules are evaluated, and if several of them match, the boosts are multiplied.
# If for example one rule with boost 2.0 and one rule with boost 0.1 match, the resulting urlboost=0.2
https?://[^/]+/old/.* 0.1 #Comments are removed
https?://[^/]+/.*index\([0-9]\).html$ 0.5
# Prioritize certain sites over others
https?://www.mydomain.no/.* 1.5

View File

@ -0,0 +1,115 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.servlet.SolrRequestParsers;
import org.apache.solr.update.AddUpdateCommand;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
public class RegexBoostProcessorTest extends SolrTestCaseJ4 {
private static RegexpBoostProcessor reProcessor;
protected static SolrRequestParsers _parser;
protected static ModifiableSolrParams parameters;
private static RegexpBoostProcessorFactory factory;
private SolrInputDocument document;
@BeforeClass
public static void setUpBeforeClass() throws Exception {
initCore("solrconfig.xml", "schema12.xml");
SolrCore core = h.getCore();
_parser = new SolrRequestParsers( null );
SolrQueryResponse resp = null;
parameters = new ModifiableSolrParams();
parameters.set(RegexpBoostProcessor.BOOST_FILENAME_PARAM, "regex-boost-processor-test.txt");
parameters.set(RegexpBoostProcessor.INPUT_FIELD_PARAM, "url");
parameters.set(RegexpBoostProcessor.BOOST_FIELD_PARAM, "urlboost");
SolrQueryRequest req = _parser.buildRequestFrom(core, new ModifiableSolrParams(), null);
factory = new RegexpBoostProcessorFactory();
factory.init(parameters.toNamedList());
reProcessor = (RegexpBoostProcessor) factory.getInstance(req, resp, null);
}
@Before
public void setUp() throws Exception {
document = new SolrInputDocument();
super.setUp();
}
@Test
public void testNoBoost() throws Exception {
document.addField("id", "doc1");
document.addField("url", "http://www.nomatch.no");
processAdd(document);
assertEquals(1.0d, document.getFieldValue("urlboost"));
}
@Test
public void testDeboostOld() throws Exception {
document.addField("id", "doc1");
document.addField("url", "http://www.somedomain.no/old/test.html");
processAdd(document);
assertEquals(0.1d, document.getFieldValue("urlboost"));
// Test the other deboost rule
document = new SolrInputDocument();
document.addField("id", "doc1");
document.addField("url", "http://www.somedomain.no/foo/index(1).html");
processAdd(document);
assertEquals(0.5d, document.getFieldValue("urlboost"));
}
@Test
public void testBoostGood() throws Exception {
document.addField("id", "doc1");
document.addField("url", "http://www.mydomain.no/fifty-percent-boost");
processAdd(document);
assertEquals(1.5d, document.getFieldValue("urlboost"));
}
@Test
public void testTwoRules() throws Exception {
document.addField("id", "doc1");
document.addField("url", "http://www.mydomain.no/old/test.html");
processAdd(document);
assertEquals(0.15d, document.getFieldValue("urlboost"));
}
private void processAdd(SolrInputDocument doc) throws Exception {
AddUpdateCommand addCommand = new AddUpdateCommand(null);
addCommand.solrDoc = doc;
reProcessor.processAdd(addCommand);
}
}