SOLR-4249: UniqFieldsUpdateProcessorFactory now extends FieldMutatingUpdateProcessorFactory and supports all of it's selector options

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1518717 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Chris M. Hostetter 2013-08-29 17:31:17 +00:00
parent d339f4278c
commit 13bedbe616
4 changed files with 90 additions and 58 deletions

View File

@ -71,6 +71,11 @@ Upgrading from Solr 4.4.0
XXXXX and delete the ones that you do not wish to use. See SOLR-4953 &
SOLR-5108 for more details.
* The UniqFieldsUpdateProcessorFactory has been improved to support all of the
FieldMutatingUpdateProcessorFactory selector options. The <lst named="fields">
init param option is now deprecated and should be replaced with the more standard
<arr name="fieldName">. See SOLR-4249 for more details.
Detailed Change List
----------------------
@ -103,6 +108,11 @@ New Features
* SOLR-5182: Add NoOpRegenerator, a regenerator for custom per-segment caches
where items are preserved across commits. (Robert Muir)
* SOLR-4249: UniqFieldsUpdateProcessorFactory now extends
FieldMutatingUpdateProcessorFactory and supports all of it's selector options. Use
of the "fields" init param is now deprecated in favor of "fieldName" (hossman)
Bug Fixes
----------------------

View File

@ -23,6 +23,9 @@ import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Map;
import org.apache.solr.core.SolrCore;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.NamedList;
@ -30,77 +33,68 @@ import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
/**
* A non-duplicate processor. Removes duplicates in the specified fields.
*
* <pre class="prettyprint" >
* &lt;updateRequestProcessorChain name="uniq-fields"&gt;
* &lt;processor class="org.apache.solr.update.processor.UniqFieldsUpdateProcessorFactory"&gt;
* &lt;lst name="fields"&gt;
* &lt;str&gt;uniq&lt;/str&gt;
* &lt;str&gt;uniq2&lt;/str&gt;
* &lt;str&gt;uniq3&lt;/str&gt;
* &lt;/lst&gt;
* &lt;/processor&gt;
* &lt;processor class="solr.RunUpdateProcessorFactory" /&gt;
* &lt;/updateRequestProcessorChain&gt;</pre>
*
*/
public class UniqFieldsUpdateProcessorFactory extends UpdateRequestProcessorFactory {
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
private Set<String> fields;
/**
* Removes duplicate values found in fields matching the specified conditions.
* The existing field values are iterated in order, and values are removed when
* they are equal to a value that has already been seen for this field.
* <p>
* By default this processor matches no fields.
* </p>
*
* <p>
* In the example configuration below, if a document initially contains the values
* <code>"Steve","Lucy","Jim",Steve","Alice","Bob","Alice"</code> in a field named
* <code>foo_uniq</code> then using this processor will result in the final list of
* field values being <code>"Steve","Lucy","Jim","Alice","Bob"</code>
* </p>
* <pre class="prettyprint">
* &lt;processor class="solr.UniqFieldsUpdateProcessorFactory"&gt;
* &lt;str name="fieldRegex"&gt;.*_uniq&lt;/str&gt;
* &lt;/processor&gt;
* </pre>
*/
public class UniqFieldsUpdateProcessorFactory extends FieldValueSubsetUpdateProcessorFactory {
public final static Logger log = LoggerFactory.getLogger(UniqFieldsUpdateProcessorFactory.class);
@Override
public FieldMutatingUpdateProcessor.FieldNameSelector
getDefaultSelector(final SolrCore core) {
return FieldMutatingUpdateProcessor.SELECT_NO_FIELDS;
}
@SuppressWarnings("unchecked")
@Override
public void init(@SuppressWarnings("rawtypes") NamedList args) {
NamedList<String> flst = (NamedList<String>)args.get("fields");
// legacy init param support, will be removed in 5.0
// no idea why this was ever implimented as <lst> should have just been <arr>
NamedList<String> flst = (NamedList<String>) args.remove("fields");
if(flst != null){
fields = new HashSet<String>();
for(int i = 0; i < flst.size(); i++){
fields.add(flst.getVal(i));
log.warn("Use of the 'fields' init param in UniqFieldsUpdateProcessorFactory is deprecated, please use 'fieldName' (or another FieldMutatingUpdateProcessorFactory selector option) instead");
log.info("Replacing 'fields' init param with (individual) 'fieldName' params");
for (Map.Entry<String,String> entry : flst) {
args.add("fieldName", entry.getValue());
}
}
super.init(args);
}
@Override
public UpdateRequestProcessor getInstance(SolrQueryRequest req,
SolrQueryResponse rsp,
UpdateRequestProcessor next) {
return new UniqFieldsUpdateProcessor(next, fields);
}
public class UniqFieldsUpdateProcessor extends UpdateRequestProcessor {
private final Set<String> fields;
public UniqFieldsUpdateProcessor(UpdateRequestProcessor next,
Set<String> fields) {
super(next);
this.fields = fields;
}
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
if(fields != null){
SolrInputDocument solrInputDocument = cmd.getSolrInputDocument();
List<Object> uniqList = new ArrayList<Object>();
for (String field : fields) {
uniqList.clear();
Collection<Object> col = solrInputDocument.getFieldValues(field);
if (col != null) {
for (Object o : col) {
if(!uniqList.contains(o))
uniqList.add(o);
}
solrInputDocument.remove(field);
for (Object o : uniqList) {
solrInputDocument.addField(field, o);
}
}
}
@SuppressWarnings("unchecked")
public Collection pickSubset(Collection values) {
Set<Object> uniqs = new HashSet<Object>();
List<Object> result = new ArrayList<Object>(values.size());
for (Object o : values) {
if (!uniqs.contains(o)) {
uniqs.add(o);
result.add(o);
}
super.processAdd(cmd);
}
return result;
}
}

View File

@ -429,6 +429,12 @@
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="uniq-values">
<processor class="solr.UniqFieldsUpdateProcessorFactory">
<str name="fieldRegex">uniq_.*</str>
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="pre-analyzed-simple">
<processor class="solr.PreAnalyzedUpdateProcessorFactory">
<str name="fieldName">subject</str>

View File

@ -94,6 +94,28 @@ public class FieldMutatingUpdateProcessorTest extends UpdateProcessorTestBase {
5.0F, d.getField("foo_s").getBoost(), 0.0F);
}
public void testUniqValues() throws Exception {
final String chain = "uniq-values";
SolrInputDocument d = null;
d = processAdd(chain,
doc(f("id", "1111"),
f("name", "Hoss", "Man", "Hoss"),
f("uniq_1_s", "Hoss", "Man", "Hoss"),
f("uniq_2_s", "Foo", "Hoss", "Man", "Hoss", "Bar"),
f("uniq_3_s", 5.0F, 23, "string", 5.0F)));
assertNotNull(d);
assertEquals(Arrays.asList("Hoss", "Man", "Hoss"),
d.getFieldValues("name"));
assertEquals(Arrays.asList("Hoss","Man"),
d.getFieldValues("uniq_1_s"));
assertEquals(Arrays.asList("Foo","Hoss","Man","Bar"),
d.getFieldValues("uniq_2_s"));
assertEquals(Arrays.asList(5.0F, 23, "string"),
d.getFieldValues("uniq_3_s"));
}
public void testTrimFields() throws Exception {
for (String chain : Arrays.asList("trim-fields", "trim-fields-arr")) {
SolrInputDocument d = null;