From 727739fc1f9901ffac6f1deb26354265ca092fe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 31 Jan 2013 12:58:40 +0000 Subject: [PATCH] SOLR-2827: RegexpBoost Update Processor git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1440940 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 2 + .../processor/RegexpBoostProcessor.java | 211 ++++++++++++++++++ .../RegexpBoostProcessorFactory.java | 52 +++++ .../conf/regex-boost-processor-test.txt | 10 + .../processor/RegexBoostProcessorTest.java | 115 ++++++++++ 5 files changed, 390 insertions(+) create mode 100644 solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java create mode 100644 solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java create mode 100644 solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt create mode 100644 solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index c88e3a238dd..8df1529ef87 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -66,6 +66,8 @@ New Features * SOLR-4043: Add ability to get success/failure responses from Collections API. (Raintung Li, Mark Miller) +* SOLR-2827: RegexpBoost Update Processor (janhoy) + Bug Fixes ---------------------- diff --git a/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java new file mode 100644 index 00000000000..62aae608d18 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.update.processor; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; +import org.apache.commons.io.IOUtils; + +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.update.AddUpdateCommand; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A processor which will match content of "inputField" against regular expressions + * found in "boostFilename", and if it matches will return the corresponding boost + * value from the file and output this to "boostField" as a double value. + * If more than one pattern matches, the boosts from each are multiplied. + *

+ * A typical use case may be to match a URL against patterns to boost or deboost + * web documents based on the URL itself: + *

+ * # Format of each line: <pattern><TAB><boost>
+ * # Example:
+ * https?://my.domain.com/temp.*  0.2
+ * 
+ *

+ * Both inputField, boostField and boostFilename are mandatory parameters. + */ +public class RegexpBoostProcessor extends UpdateRequestProcessor { + + protected static final String INPUT_FIELD_PARAM = "inputField"; + protected static final String BOOST_FIELD_PARAM = "boostField"; + protected static final String BOOST_FILENAME_PARAM = "boostFilename"; + private static final String DEFAULT_INPUT_FIELDNAME = "url"; + private static final String DEFAULT_BOOST_FIELDNAME = "urlboost"; + + private static final Logger log = LoggerFactory.getLogger(RegexpBoostProcessor.class); + + private boolean enabled = true; + private String inputFieldname = DEFAULT_INPUT_FIELDNAME; + private String boostFieldname = DEFAULT_BOOST_FIELDNAME; + private String boostFilename; + private List boostEntries = new ArrayList(); + private static final String BOOST_ENTRIES_CACHE_KEY = "boost-entries"; + + RegexpBoostProcessor(SolrParams parameters, + SolrQueryRequest request, + SolrQueryResponse response, + UpdateRequestProcessor nextProcessor, + final Map sharedObjectCache) { + super(nextProcessor); + this.initParameters(parameters); + + if (this.boostFilename == null) { + log.warn("Null boost filename. Disabling processor."); + setEnabled(false); + } + + if (!isEnabled()) { + return; + } + + try { + synchronized (sharedObjectCache) { + List cachedBoostEntries = + (List) sharedObjectCache.get(BOOST_ENTRIES_CACHE_KEY); + + if (cachedBoostEntries == null) { + log.debug("No pre-cached boost entry list found, initializing new"); + InputStream is = request.getCore().getResourceLoader().openResource(boostFilename); + cachedBoostEntries = initBoostEntries(is); + sharedObjectCache.put(BOOST_ENTRIES_CACHE_KEY, cachedBoostEntries); + } else { + if (log.isDebugEnabled()) { + log.debug("Using cached boost entry list with " + cachedBoostEntries.size() + " elements."); + } + } + + this.boostEntries = cachedBoostEntries; + } + } catch (IOException ioe) { + log.warn("IOException while initializing boost entries from file " + this.boostFilename, ioe); + } + } + + private void initParameters(SolrParams parameters) { + if (parameters != null) { + this.setEnabled(parameters.getBool("enabled", true)); + this.inputFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_INPUT_FIELDNAME); + this.boostFieldname = parameters.get(BOOST_FIELD_PARAM, DEFAULT_BOOST_FIELDNAME); + this.boostFilename = parameters.get(BOOST_FILENAME_PARAM); + } + } + + private List initBoostEntries(InputStream is) throws IOException { + List newBoostEntries = new ArrayList(); + + BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8"))); + try { + String line = null; + while ((line = reader.readLine()) != null) { + // Remove comments + line = line.replaceAll("\\s+#.*$", ""); + line = line.replaceAll("^#.*$", ""); + + // Skip empty lines or comment lines + if (line.trim().length() == 0) { + continue; + } + + String[] fields = line.split("\\s+"); + + if (fields.length == 2) { + String regexp = fields[0]; + String boost = fields[1]; + newBoostEntries.add(new BoostEntry(Pattern.compile(regexp), Double.parseDouble(boost))); + log.debug("Read regexp " + regexp + " with boost " + boost); + } else { + log.warn("Malformed config input line: " + line + " (expected 2 fields, got " + fields.length + " fields). Skipping entry."); + continue; + } + } + } finally { + IOUtils.closeQuietly(reader); + } + + return newBoostEntries; + } + + @Override + public void processAdd(AddUpdateCommand command) throws IOException { + if (isEnabled()) { + processBoost(command); + } + super.processAdd(command); + } + + public void processBoost(AddUpdateCommand command) { + SolrInputDocument document = command.getSolrInputDocument(); + if (document.containsKey(inputFieldname)) { + String value = (String) document.getFieldValue(inputFieldname); + double boost = 1.0f; + for (BoostEntry boostEntry : boostEntries) { + if (boostEntry.getPattern().matcher(value).matches()) { + if (log.isDebugEnabled()) { + log.debug("Pattern match " + boostEntry.getPattern().pattern() + " for " + value); + } + boost = (boostEntry.getBoost() * 1000) * (boost * 1000) / 1000000; + } + } + document.setField(boostFieldname, boost); + + if (log.isDebugEnabled()) { + log.debug("Value " + boost + ", applied to field " + boostFieldname); + } + } + } + + public boolean isEnabled() { + return enabled; + } + + public void setEnabled(boolean enabled) { + this.enabled = enabled; + } + + private static class BoostEntry { + + private Pattern pattern; + private double boost; + + public BoostEntry(Pattern pattern, double d) { + this.pattern = pattern; + this.boost = d; + } + + public Pattern getPattern() { + return pattern; + } + + public double getBoost() { + return boost; + } + } +} diff --git a/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java new file mode 100644 index 00000000000..5bb22ba07ba --- /dev/null +++ b/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.update.processor; + +import java.util.HashMap; +import java.util.Map; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; + +/** + * Factory which creates RegexBoostProcessors + *

+ * The factory initializes a shared object cache which is passed to the processor + * and this way reduces rules file parsing to the first time the UpdateChain + * is initialized. + */ +public class RegexpBoostProcessorFactory extends UpdateRequestProcessorFactory { + + private SolrParams params; + private final Map sharedObjectCache = new HashMap(); + + @Override + public void init(@SuppressWarnings("rawtypes") final NamedList args) { + if (args != null) { + this.params = SolrParams.toSolrParams(args); + } + } + + @Override + public UpdateRequestProcessor getInstance(SolrQueryRequest request, + SolrQueryResponse response, + UpdateRequestProcessor nextProcessor) { + + return new RegexpBoostProcessor(this.params, request, response, nextProcessor, this.sharedObjectCache); + } +} diff --git a/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt b/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt new file mode 100644 index 00000000000..1dc0537c72b --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt @@ -0,0 +1,10 @@ +# Sample config file for RegexBoostProcessor +# This example applies boost on the "url" field to boost or deboost certain urls +# All rules are evaluated, and if several of them match, the boosts are multiplied. +# If for example one rule with boost 2.0 and one rule with boost 0.1 match, the resulting urlboost=0.2 + +https?://[^/]+/old/.* 0.1 #Comments are removed +https?://[^/]+/.*index\([0-9]\).html$ 0.5 + +# Prioritize certain sites over others +https?://www.mydomain.no/.* 1.5 \ No newline at end of file diff --git a/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java b/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java new file mode 100644 index 00000000000..2a1e0f39b9a --- /dev/null +++ b/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.update.processor; + +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.core.SolrCore; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.servlet.SolrRequestParsers; +import org.apache.solr.update.AddUpdateCommand; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +public class RegexBoostProcessorTest extends SolrTestCaseJ4 { + private static RegexpBoostProcessor reProcessor; + protected static SolrRequestParsers _parser; + protected static ModifiableSolrParams parameters; + private static RegexpBoostProcessorFactory factory; + private SolrInputDocument document; + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + initCore("solrconfig.xml", "schema12.xml"); + SolrCore core = h.getCore(); + _parser = new SolrRequestParsers( null ); + SolrQueryResponse resp = null; + parameters = new ModifiableSolrParams(); + parameters.set(RegexpBoostProcessor.BOOST_FILENAME_PARAM, "regex-boost-processor-test.txt"); + parameters.set(RegexpBoostProcessor.INPUT_FIELD_PARAM, "url"); + parameters.set(RegexpBoostProcessor.BOOST_FIELD_PARAM, "urlboost"); + SolrQueryRequest req = _parser.buildRequestFrom(core, new ModifiableSolrParams(), null); + factory = new RegexpBoostProcessorFactory(); + factory.init(parameters.toNamedList()); + reProcessor = (RegexpBoostProcessor) factory.getInstance(req, resp, null); + } + + @Before + public void setUp() throws Exception { + document = new SolrInputDocument(); + super.setUp(); + } + + @Test + public void testNoBoost() throws Exception { + document.addField("id", "doc1"); + document.addField("url", "http://www.nomatch.no"); + + processAdd(document); + + assertEquals(1.0d, document.getFieldValue("urlboost")); + } + + @Test + public void testDeboostOld() throws Exception { + document.addField("id", "doc1"); + document.addField("url", "http://www.somedomain.no/old/test.html"); + + processAdd(document); + + assertEquals(0.1d, document.getFieldValue("urlboost")); + + // Test the other deboost rule + document = new SolrInputDocument(); + document.addField("id", "doc1"); + document.addField("url", "http://www.somedomain.no/foo/index(1).html"); + + processAdd(document); + + assertEquals(0.5d, document.getFieldValue("urlboost")); +} + + @Test + public void testBoostGood() throws Exception { + document.addField("id", "doc1"); + document.addField("url", "http://www.mydomain.no/fifty-percent-boost"); + + processAdd(document); + + assertEquals(1.5d, document.getFieldValue("urlboost")); + } + + @Test + public void testTwoRules() throws Exception { + document.addField("id", "doc1"); + document.addField("url", "http://www.mydomain.no/old/test.html"); + + processAdd(document); + + assertEquals(0.15d, document.getFieldValue("urlboost")); + } + + private void processAdd(SolrInputDocument doc) throws Exception { + AddUpdateCommand addCommand = new AddUpdateCommand(null); + addCommand.solrDoc = doc; + reProcessor.processAdd(addCommand); + } + +}