mirror of https://github.com/apache/lucene.git
SOLR-2827: RegexpBoost Update Processor
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1440940 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
398b80b6de
commit
727739fc1f
|
@ -66,6 +66,8 @@ New Features
|
||||||
* SOLR-4043: Add ability to get success/failure responses from Collections API.
|
* SOLR-4043: Add ability to get success/failure responses from Collections API.
|
||||||
(Raintung Li, Mark Miller)
|
(Raintung Li, Mark Miller)
|
||||||
|
|
||||||
|
* SOLR-2827: RegexpBoost Update Processor (janhoy)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,211 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.solr.update.processor;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
|
||||||
|
import org.apache.solr.common.SolrInputDocument;
|
||||||
|
import org.apache.solr.common.params.SolrParams;
|
||||||
|
import org.apache.solr.request.SolrQueryRequest;
|
||||||
|
import org.apache.solr.response.SolrQueryResponse;
|
||||||
|
import org.apache.solr.update.AddUpdateCommand;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A processor which will match content of "inputField" against regular expressions
|
||||||
|
* found in "boostFilename", and if it matches will return the corresponding boost
|
||||||
|
* value from the file and output this to "boostField" as a double value.
|
||||||
|
* If more than one pattern matches, the boosts from each are multiplied.
|
||||||
|
* <p>
|
||||||
|
* A typical use case may be to match a URL against patterns to boost or deboost
|
||||||
|
* web documents based on the URL itself:
|
||||||
|
* <pre>
|
||||||
|
* # Format of each line: <pattern><TAB><boost>
|
||||||
|
* # Example:
|
||||||
|
* https?://my.domain.com/temp.* 0.2
|
||||||
|
* </pre>
|
||||||
|
* <p>
|
||||||
|
* Both inputField, boostField and boostFilename are mandatory parameters.
|
||||||
|
*/
|
||||||
|
public class RegexpBoostProcessor extends UpdateRequestProcessor {
|
||||||
|
|
||||||
|
protected static final String INPUT_FIELD_PARAM = "inputField";
|
||||||
|
protected static final String BOOST_FIELD_PARAM = "boostField";
|
||||||
|
protected static final String BOOST_FILENAME_PARAM = "boostFilename";
|
||||||
|
private static final String DEFAULT_INPUT_FIELDNAME = "url";
|
||||||
|
private static final String DEFAULT_BOOST_FIELDNAME = "urlboost";
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(RegexpBoostProcessor.class);
|
||||||
|
|
||||||
|
private boolean enabled = true;
|
||||||
|
private String inputFieldname = DEFAULT_INPUT_FIELDNAME;
|
||||||
|
private String boostFieldname = DEFAULT_BOOST_FIELDNAME;
|
||||||
|
private String boostFilename;
|
||||||
|
private List<BoostEntry> boostEntries = new ArrayList<BoostEntry>();
|
||||||
|
private static final String BOOST_ENTRIES_CACHE_KEY = "boost-entries";
|
||||||
|
|
||||||
|
RegexpBoostProcessor(SolrParams parameters,
|
||||||
|
SolrQueryRequest request,
|
||||||
|
SolrQueryResponse response,
|
||||||
|
UpdateRequestProcessor nextProcessor,
|
||||||
|
final Map<Object, Object> sharedObjectCache) {
|
||||||
|
super(nextProcessor);
|
||||||
|
this.initParameters(parameters);
|
||||||
|
|
||||||
|
if (this.boostFilename == null) {
|
||||||
|
log.warn("Null boost filename. Disabling processor.");
|
||||||
|
setEnabled(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isEnabled()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
synchronized (sharedObjectCache) {
|
||||||
|
List<BoostEntry> cachedBoostEntries =
|
||||||
|
(List<BoostEntry>) sharedObjectCache.get(BOOST_ENTRIES_CACHE_KEY);
|
||||||
|
|
||||||
|
if (cachedBoostEntries == null) {
|
||||||
|
log.debug("No pre-cached boost entry list found, initializing new");
|
||||||
|
InputStream is = request.getCore().getResourceLoader().openResource(boostFilename);
|
||||||
|
cachedBoostEntries = initBoostEntries(is);
|
||||||
|
sharedObjectCache.put(BOOST_ENTRIES_CACHE_KEY, cachedBoostEntries);
|
||||||
|
} else {
|
||||||
|
if (log.isDebugEnabled()) {
|
||||||
|
log.debug("Using cached boost entry list with " + cachedBoostEntries.size() + " elements.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
this.boostEntries = cachedBoostEntries;
|
||||||
|
}
|
||||||
|
} catch (IOException ioe) {
|
||||||
|
log.warn("IOException while initializing boost entries from file " + this.boostFilename, ioe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initParameters(SolrParams parameters) {
|
||||||
|
if (parameters != null) {
|
||||||
|
this.setEnabled(parameters.getBool("enabled", true));
|
||||||
|
this.inputFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_INPUT_FIELDNAME);
|
||||||
|
this.boostFieldname = parameters.get(BOOST_FIELD_PARAM, DEFAULT_BOOST_FIELDNAME);
|
||||||
|
this.boostFilename = parameters.get(BOOST_FILENAME_PARAM);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<BoostEntry> initBoostEntries(InputStream is) throws IOException {
|
||||||
|
List<BoostEntry> newBoostEntries = new ArrayList<BoostEntry>();
|
||||||
|
|
||||||
|
BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
|
||||||
|
try {
|
||||||
|
String line = null;
|
||||||
|
while ((line = reader.readLine()) != null) {
|
||||||
|
// Remove comments
|
||||||
|
line = line.replaceAll("\\s+#.*$", "");
|
||||||
|
line = line.replaceAll("^#.*$", "");
|
||||||
|
|
||||||
|
// Skip empty lines or comment lines
|
||||||
|
if (line.trim().length() == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] fields = line.split("\\s+");
|
||||||
|
|
||||||
|
if (fields.length == 2) {
|
||||||
|
String regexp = fields[0];
|
||||||
|
String boost = fields[1];
|
||||||
|
newBoostEntries.add(new BoostEntry(Pattern.compile(regexp), Double.parseDouble(boost)));
|
||||||
|
log.debug("Read regexp " + regexp + " with boost " + boost);
|
||||||
|
} else {
|
||||||
|
log.warn("Malformed config input line: " + line + " (expected 2 fields, got " + fields.length + " fields). Skipping entry.");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
IOUtils.closeQuietly(reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
return newBoostEntries;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void processAdd(AddUpdateCommand command) throws IOException {
|
||||||
|
if (isEnabled()) {
|
||||||
|
processBoost(command);
|
||||||
|
}
|
||||||
|
super.processAdd(command);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void processBoost(AddUpdateCommand command) {
|
||||||
|
SolrInputDocument document = command.getSolrInputDocument();
|
||||||
|
if (document.containsKey(inputFieldname)) {
|
||||||
|
String value = (String) document.getFieldValue(inputFieldname);
|
||||||
|
double boost = 1.0f;
|
||||||
|
for (BoostEntry boostEntry : boostEntries) {
|
||||||
|
if (boostEntry.getPattern().matcher(value).matches()) {
|
||||||
|
if (log.isDebugEnabled()) {
|
||||||
|
log.debug("Pattern match " + boostEntry.getPattern().pattern() + " for " + value);
|
||||||
|
}
|
||||||
|
boost = (boostEntry.getBoost() * 1000) * (boost * 1000) / 1000000;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
document.setField(boostFieldname, boost);
|
||||||
|
|
||||||
|
if (log.isDebugEnabled()) {
|
||||||
|
log.debug("Value " + boost + ", applied to field " + boostFieldname);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEnabled() {
|
||||||
|
return enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEnabled(boolean enabled) {
|
||||||
|
this.enabled = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class BoostEntry {
|
||||||
|
|
||||||
|
private Pattern pattern;
|
||||||
|
private double boost;
|
||||||
|
|
||||||
|
public BoostEntry(Pattern pattern, double d) {
|
||||||
|
this.pattern = pattern;
|
||||||
|
this.boost = d;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Pattern getPattern() {
|
||||||
|
return pattern;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getBoost() {
|
||||||
|
return boost;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.solr.update.processor;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import org.apache.solr.common.params.SolrParams;
|
||||||
|
import org.apache.solr.common.util.NamedList;
|
||||||
|
import org.apache.solr.request.SolrQueryRequest;
|
||||||
|
import org.apache.solr.response.SolrQueryResponse;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory which creates RegexBoostProcessors
|
||||||
|
* <p>
|
||||||
|
* The factory initializes a shared object cache which is passed to the processor
|
||||||
|
* and this way reduces rules file parsing to the first time the UpdateChain
|
||||||
|
* is initialized.
|
||||||
|
*/
|
||||||
|
public class RegexpBoostProcessorFactory extends UpdateRequestProcessorFactory {
|
||||||
|
|
||||||
|
private SolrParams params;
|
||||||
|
private final Map<Object, Object> sharedObjectCache = new HashMap<Object, Object>();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void init(@SuppressWarnings("rawtypes") final NamedList args) {
|
||||||
|
if (args != null) {
|
||||||
|
this.params = SolrParams.toSolrParams(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public UpdateRequestProcessor getInstance(SolrQueryRequest request,
|
||||||
|
SolrQueryResponse response,
|
||||||
|
UpdateRequestProcessor nextProcessor) {
|
||||||
|
|
||||||
|
return new RegexpBoostProcessor(this.params, request, response, nextProcessor, this.sharedObjectCache);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,10 @@
|
||||||
|
# Sample config file for RegexBoostProcessor
|
||||||
|
# This example applies boost on the "url" field to boost or deboost certain urls
|
||||||
|
# All rules are evaluated, and if several of them match, the boosts are multiplied.
|
||||||
|
# If for example one rule with boost 2.0 and one rule with boost 0.1 match, the resulting urlboost=0.2
|
||||||
|
|
||||||
|
https?://[^/]+/old/.* 0.1 #Comments are removed
|
||||||
|
https?://[^/]+/.*index\([0-9]\).html$ 0.5
|
||||||
|
|
||||||
|
# Prioritize certain sites over others
|
||||||
|
https?://www.mydomain.no/.* 1.5
|
|
@ -0,0 +1,115 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.solr.update.processor;
|
||||||
|
|
||||||
|
import org.apache.solr.SolrTestCaseJ4;
|
||||||
|
import org.apache.solr.common.SolrInputDocument;
|
||||||
|
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||||
|
import org.apache.solr.core.SolrCore;
|
||||||
|
import org.apache.solr.request.SolrQueryRequest;
|
||||||
|
import org.apache.solr.response.SolrQueryResponse;
|
||||||
|
import org.apache.solr.servlet.SolrRequestParsers;
|
||||||
|
import org.apache.solr.update.AddUpdateCommand;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class RegexBoostProcessorTest extends SolrTestCaseJ4 {
|
||||||
|
private static RegexpBoostProcessor reProcessor;
|
||||||
|
protected static SolrRequestParsers _parser;
|
||||||
|
protected static ModifiableSolrParams parameters;
|
||||||
|
private static RegexpBoostProcessorFactory factory;
|
||||||
|
private SolrInputDocument document;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void setUpBeforeClass() throws Exception {
|
||||||
|
initCore("solrconfig.xml", "schema12.xml");
|
||||||
|
SolrCore core = h.getCore();
|
||||||
|
_parser = new SolrRequestParsers( null );
|
||||||
|
SolrQueryResponse resp = null;
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.set(RegexpBoostProcessor.BOOST_FILENAME_PARAM, "regex-boost-processor-test.txt");
|
||||||
|
parameters.set(RegexpBoostProcessor.INPUT_FIELD_PARAM, "url");
|
||||||
|
parameters.set(RegexpBoostProcessor.BOOST_FIELD_PARAM, "urlboost");
|
||||||
|
SolrQueryRequest req = _parser.buildRequestFrom(core, new ModifiableSolrParams(), null);
|
||||||
|
factory = new RegexpBoostProcessorFactory();
|
||||||
|
factory.init(parameters.toNamedList());
|
||||||
|
reProcessor = (RegexpBoostProcessor) factory.getInstance(req, resp, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
document = new SolrInputDocument();
|
||||||
|
super.setUp();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNoBoost() throws Exception {
|
||||||
|
document.addField("id", "doc1");
|
||||||
|
document.addField("url", "http://www.nomatch.no");
|
||||||
|
|
||||||
|
processAdd(document);
|
||||||
|
|
||||||
|
assertEquals(1.0d, document.getFieldValue("urlboost"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDeboostOld() throws Exception {
|
||||||
|
document.addField("id", "doc1");
|
||||||
|
document.addField("url", "http://www.somedomain.no/old/test.html");
|
||||||
|
|
||||||
|
processAdd(document);
|
||||||
|
|
||||||
|
assertEquals(0.1d, document.getFieldValue("urlboost"));
|
||||||
|
|
||||||
|
// Test the other deboost rule
|
||||||
|
document = new SolrInputDocument();
|
||||||
|
document.addField("id", "doc1");
|
||||||
|
document.addField("url", "http://www.somedomain.no/foo/index(1).html");
|
||||||
|
|
||||||
|
processAdd(document);
|
||||||
|
|
||||||
|
assertEquals(0.5d, document.getFieldValue("urlboost"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBoostGood() throws Exception {
|
||||||
|
document.addField("id", "doc1");
|
||||||
|
document.addField("url", "http://www.mydomain.no/fifty-percent-boost");
|
||||||
|
|
||||||
|
processAdd(document);
|
||||||
|
|
||||||
|
assertEquals(1.5d, document.getFieldValue("urlboost"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTwoRules() throws Exception {
|
||||||
|
document.addField("id", "doc1");
|
||||||
|
document.addField("url", "http://www.mydomain.no/old/test.html");
|
||||||
|
|
||||||
|
processAdd(document);
|
||||||
|
|
||||||
|
assertEquals(0.15d, document.getFieldValue("urlboost"));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processAdd(SolrInputDocument doc) throws Exception {
|
||||||
|
AddUpdateCommand addCommand = new AddUpdateCommand(null);
|
||||||
|
addCommand.solrDoc = doc;
|
||||||
|
reProcessor.processAdd(addCommand);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue