SOLR-2584: add UniqFieldsUpdateProcessor

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1149050 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2011-07-21 07:09:27 +00:00
parent 857ec84c15
commit 1464513f34
5 changed files with 251 additions and 0 deletions

View File

@ -313,6 +313,9 @@ New Features
* LUCENE-2048: Added omitPositions to the schema, so you can omit position * LUCENE-2048: Added omitPositions to the schema, so you can omit position
information while still indexing term frequencies. (rmuir) information while still indexing term frequencies. (rmuir)
* SOLR-2584: add UniqFieldsUpdateProcessor that removes duplicate values in the
specified fields. (Elmer Garduno, koji)
Optimizations Optimizations
---------------------- ----------------------

View File

@ -0,0 +1,108 @@
package org.apache.solr.update.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
/**
* A non-duplicate processor. Removes duplicates in the specified fields.
*
* <pre class="prettyprint" >
* &lt;updateRequestProcessorChain name="uniq-fields"&gt;
* &lt;processor class="org.apache.solr.update.processor.UniqFieldsUpdateProcessorFactory"&gt;
* &lt;lst name="fields"&gt;
* &lt;str&gt;uniq&lt;/str&gt;
* &lt;str&gt;uniq2&lt;/str&gt;
* &lt;str&gt;uniq3&lt;/str&gt;
* &lt;/lst&gt;
* &lt;/processor&gt;
* &lt;processor class="solr.RunUpdateProcessorFactory" /&gt;
* &lt;/updateRequestProcessorChain&gt;</pre>
*
*/
public class UniqFieldsUpdateProcessorFactory extends UpdateRequestProcessorFactory {
private Set<String> fields;
@SuppressWarnings("unchecked")
@Override
public void init(@SuppressWarnings("rawtypes") NamedList args) {
NamedList<String> flst = (NamedList<String>)args.get("fields");
if(flst != null){
fields = new HashSet<String>();
for(int i = 0; i < flst.size(); i++){
fields.add(flst.getVal(i));
}
}
}
@Override
public UpdateRequestProcessor getInstance(SolrQueryRequest req,
SolrQueryResponse rsp,
UpdateRequestProcessor next) {
return new UniqFieldsUpdateProcessor(next, fields);
}
public class UniqFieldsUpdateProcessor extends UpdateRequestProcessor {
private final Set<String> fields;
public UniqFieldsUpdateProcessor(UpdateRequestProcessor next,
Set<String> fields) {
super(next);
this.fields = fields;
}
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
if(fields != null){
SolrInputDocument solrInputDocument = cmd.getSolrInputDocument();
List<Object> uniqList = new ArrayList<Object>();
for (String field : fields) {
uniqList.clear();
Collection<Object> col = solrInputDocument.getFieldValues(field);
if (col != null) {
for (Object o : col) {
if(!uniqList.contains(o))
uniqList.add(o);
}
solrInputDocument.remove(field);
for (Object o : uniqList) {
solrInputDocument.addField(field, o);
}
}
}
}
super.processAdd(cmd);
}
}
}

View File

@ -523,6 +523,13 @@
<field name="pointD" type="xyd" indexed="true" stored="true" multiValued="false"/> <field name="pointD" type="xyd" indexed="true" stored="true" multiValued="false"/>
<field name="point_hash" type="geohash" indexed="true" stored="true" multiValued="false"/> <field name="point_hash" type="geohash" indexed="true" stored="true" multiValued="false"/>
<field name="store" type="location" indexed="true" stored="true"/> <field name="store" type="location" indexed="true" stored="true"/>
<!-- to test uniq fields -->
<field name="uniq" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="uniq2" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="uniq3" type="string" indexed="true" stored="true"/>
<field name="nouniq" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/> <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/>

View File

@ -491,5 +491,15 @@
</processor> </processor>
<processor class="solr.RunUpdateProcessorFactory" /> <processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain> </updateRequestProcessorChain>
<updateRequestProcessorChain name="uniq-fields">
<processor class="org.apache.solr.update.processor.UniqFieldsUpdateProcessorFactory">
<lst name="fields">
<str>uniq</str>
<str>uniq2</str>
<str>uniq3</str>
</lst>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
</config> </config>

View File

@ -0,0 +1,123 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.MultiMapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.XmlUpdateRequestHandler;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.response.SolrQueryResponse;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
/**
*
*/
public class UniqFieldsUpdateProcessorFactoryTest extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig.xml", "schema12.xml");
}
@Override
@Before
public void setUp() throws Exception {
super.setUp();
clearIndex();
assertU(commit());
}
@Test
public void testUniqFields() throws Exception {
SolrCore core = h.getCore();
UpdateRequestProcessorChain chained = core
.getUpdateProcessingChain("uniq-fields");
UniqFieldsUpdateProcessorFactory factory = ((UniqFieldsUpdateProcessorFactory) chained
.getFactories()[0]);
assertNotNull(chained);
addDoc(adoc("id", "1a",
"uniq", "value1",
"uniq", "value1",
"uniq", "value2"));
addDoc(adoc("id", "2a",
"uniq2", "value1",
"uniq2", "value2",
"uniq2", "value1",
"uniq2", "value3",
"uniq", "value1",
"uniq", "value1"));
addDoc(adoc("id", "1b",
"uniq3", "value1",
"uniq3", "value1"));
addDoc(adoc("id", "1c",
"nouniq", "value1",
"nouniq", "value1",
"nouniq", "value2"));
addDoc(adoc("id", "2c",
"nouniq", "value1",
"nouniq", "value1",
"nouniq", "value2",
"uniq2", "value1",
"uniq2", "value1"));
assertU(commit());
assertQ(req("id:1a"), "count(//*[@name='uniq']/*)=2",
"//arr[@name='uniq']/str[1][.='value1']",
"//arr[@name='uniq']/str[2][.='value2']");
assertQ(req("id:2a"), "count(//*[@name='uniq2']/*)=3",
"//arr[@name='uniq2']/str[1][.='value1']",
"//arr[@name='uniq2']/str[2][.='value2']",
"//arr[@name='uniq2']/str[3][.='value3']");
assertQ(req("id:2a"), "count(//*[@name='uniq']/*)=1");
assertQ(req("id:1b"), "count(//*[@name='uniq3'])=1");
assertQ(req("id:1c"), "count(//*[@name='nouniq']/*)=3");
assertQ(req("id:2c"), "count(//*[@name='nouniq']/*)=3");
assertQ(req("id:2c"), "count(//*[@name='uniq2']/*)=1");
}
private void addDoc(String doc) throws Exception {
Map<String, String[]> params = new HashMap<String, String[]>();
MultiMapSolrParams mmparams = new MultiMapSolrParams(params);
params.put(UpdateParams.UPDATE_CHAIN, new String[] { "uniq-fields" });
SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(),
(SolrParams) mmparams) {
};
XmlUpdateRequestHandler handler = new XmlUpdateRequestHandler();
handler.init(null);
ArrayList<ContentStream> streams = new ArrayList<ContentStream>(2);
streams.add(new ContentStreamBase.StringStream(doc));
req.setContentStreams(streams);
handler.handleRequestBody(req, new SolrQueryResponse());
req.close();
}
}