SOLR-3670: New CountFieldValuesUpdateProcessorFactory

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1372687 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Chris M. Hostetter 2012-08-14 00:35:55 +00:00
parent e64057ccd9
commit 680449363f
4 changed files with 176 additions and 0 deletions

View File

@ -31,6 +31,15 @@ Upgrading from Solr 4.0.0-BETA
In order to better support distributed search mode, the TermVectorComponent's response format has been changed so that if the schema defines a uniqueKeyField, then that field value is used as the "key" for each document in it's response section, instead of the internal lucene doc id. Users w/o a uniqueKeyField will continue to see the same response format. See SOLR-3229 for more details.
Detailed Change List
----------------------
New Features
----------------------
* SOLR-3670: New CountFieldValuesUpdateProcessorFactory makes it easy to index
the number of values in another field for later use at query time. (hossman)
Optimizations
----------------------

View File

@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TextField;
import org.apache.solr.schema.StrField;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.commons.lang.StringUtils;
/**
* <p>
* Replaces any list of values for a field matching the specified
* conditions with the the count of the number of values for that field.
* </p>
* <p>
* By default, this processor matches no fields.
* </p>
* <p>
* The typical use case for this processor would be in combination with the
* {@link CloneFieldUpdateProcessorFactory} so that it's possible to query by
* the quantity of values in the source field.
* <p>
* For example, in the configuration below, the end result will be that the
* <code>category_count</code> field can be used to search for documents based
* on how many values they contain in the <code>category</code> field.
* </p>
*
* <pre class="prettyprint">
* &lt;updateRequestProcessorChain&gt;
* &lt;processor class="solr.CloneFieldUpdateProcessorFactory"&gt;
* &lt;str name="source"&gt;category&lt;/str&gt;
* &lt;str name="dest"&gt;category_count&lt;/str&gt;
* &lt;/processor&gt;
* &lt;processor class="solr.CountFieldValuesUpdateProcessorFactory"&gt;
* &lt;str name="fieldName"&gt;category_count&lt;/str&gt;
* &lt;/processor&gt;
* &lt;processor class="solr.DefaultValueUpdateProcessorFactory"&gt;
* &lt;str name="fieldName"&gt;category_count&lt;/str&gt;
* &lt;int name="value"&gt;0&lt;/int&gt;
* &lt;/processor&gt;
* &lt;/updateRequestProcessorChain&gt;
* </pre>
*
* <p>
* <b>NOTE:</b> The use of {@link DefaultValueUpdateProcessorFactory} is
* important in this example to ensure that all documents have a value for the
* <code>category_count</code> field, because
* <code>CountFieldValuesUpdateProcessorFactory</code> only <i>replaces</i> the
* list of values with the size of that list. If
* <code>DefaultValueUpdateProcessorFactory</code> was not used, then any
* document that had no values for the <code>category</code> field, would also
* have no value in the <code>category_count</code> field.
* </p>
*/
public final class CountFieldValuesUpdateProcessorFactory extends FieldMutatingUpdateProcessorFactory {
@Override
public UpdateRequestProcessor getInstance(SolrQueryRequest req,
SolrQueryResponse rsp,
UpdateRequestProcessor next) {
return new FieldMutatingUpdateProcessor(getSelector(), next) {
protected SolrInputField mutate(final SolrInputField src) {
SolrInputField result = new SolrInputField(src.getName());
result.setValue(src.getValueCount(),
src.getBoost());
return result;
}
};
}
}

View File

@ -231,6 +231,12 @@
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="count">
<processor class="solr.CountFieldValuesUpdateProcessorFactory">
<str name="fieldName">count_field</str>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="ignore-not-in-schema">
<processor class="solr.IgnoreFieldUpdateProcessorFactory" />
</updateRequestProcessorChain>
@ -344,6 +350,20 @@
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="clone-then-count">
<processor class="solr.CloneFieldUpdateProcessorFactory">
<str name="source">category</str>
<str name="dest">category_count</str>
</processor>
<processor class="solr.CountFieldValuesUpdateProcessorFactory">
<str name="fieldName">category_count</str>
</processor>
<processor class="solr.DefaultValueUpdateProcessorFactory">
<str name="fieldName">category_count</str>
<int name="value">0</int>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="regex-replace">
<processor class="solr.RegexReplaceProcessorFactory">
<str name="fieldName">content</str>

View File

@ -719,6 +719,58 @@ public class FieldMutatingUpdateProcessorTest extends UpdateProcessorTestBase {
assertEquals(Arrays.asList(87, 78),
d.getFieldValues("all_prices"));
}
public void testCountValues() throws Exception {
SolrInputDocument d = null;
// trivial
d = processAdd("count",
doc(f("id", "1111"),
f("count_field", "aaa", "bbb", "ccc")));
assertNotNull(d);
assertEquals(3, d.getFieldValue("count_field"));
// edge case: no values to count, means no count
// (use default if you want one)
d = processAdd("count",
doc(f("id", "1111")));
assertNotNull(d);
assertFalse(d.containsKey("count_field"));
// typical usecase: clone and count
d = processAdd("clone-then-count",
doc(f("id", "1111"),
f("category", "scifi", "war", "space"),
f("editors", "John W. Campbell"),
f("list_price", 1000)));
assertNotNull(d);
assertEquals(Arrays.asList("scifi", "war", "space"),
d.getFieldValues("category"));
assertEquals(3,
d.getFieldValue("category_count"));
assertEquals(Arrays.asList("John W. Campbell"),
d.getFieldValues("editors"));
assertEquals(1000,d.getFieldValue("list_price"));
// typical usecase: clone and count demonstrating default
d = processAdd("clone-then-count",
doc(f("id", "1111"),
f("editors", "Anonymous"),
f("list_price", 1000)));
assertNotNull(d);
assertEquals(0,
d.getFieldValue("category_count"));
assertEquals(Arrays.asList("Anonymous"),
d.getFieldValues("editors"));
assertEquals(1000,d.getFieldValue("list_price"));
}
public void testCloneCombinations() throws Exception {