diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 19dd8a1fd27..8f9bc007c76 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -207,6 +207,21 @@ New Features
* SOLR-1726: Added deep paging support to search (sort by score only) which should use less memory when paging deeply into results
by keeping the priority queue small. (Manojkumar Rangasamy Kannadasan, gsingers)
+* SOLR-2802: New FieldMutatingUpdateProcessor and Factory to simplify the
+ development of UpdateProcessors that modify field values of documents as
+ they are indexed. Also includes several useful new implementations:
+ RemoveBlankFieldUpdateProcessorFactory
+ TrimFieldUpdateProcessorFactory
+ HTMLStripFieldUpdateProcessorFactory
+ RegexReplaceProcessorFactory
+ FieldLengthUpdateProcessorFactory
+ ConcatFieldUpdateProcessorFactory
+ FirstFieldValueUpdateProcessorFactory
+ LastFieldValueUpdateProcessorFactory
+ MinFieldValueUpdateProcessorFactory
+ MaxFieldValueUpdateProcessorFactory
+ (hossman, janhoy)
+
Optimizations
----------------------
diff --git a/solr/core/src/java/org/apache/solr/update/processor/ConcatFieldUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ConcatFieldUpdateProcessorFactory.java
new file mode 100644
index 00000000000..26aca06416b
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/update/processor/ConcatFieldUpdateProcessorFactory.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.schema.TextField;
+import org.apache.solr.schema.StrField;
+
+import org.apache.solr.common.SolrInputField;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * Concatenates multiple values for fields matching the specified
+ * conditions using a configurable delimiter which defaults
+ * to ", ".
+ *
+ * By default, this processor concatenates the values for any field name
+ * which according to the schema is multiValued="false"
+ * and uses TextField or StrField
+ *
+ *
+ *
+ * For example, in the configuration below, any "single valued" string and
+ * text field which is found to contain multiple values except for
+ * the primary_author field will be concatenated using the
+ * string " ;" as a delimiter. For the
+ * primary_author field, the multiple values will be left
+ * alone for FirstFieldValueUpdateProcessorFactory to deal with.
+ *
+ */
+public final class ConcatFieldUpdateProcessorFactory extends FieldMutatingUpdateProcessorFactory {
+
+  /** Separator placed between joined values; overridden by the "delimiter" init arg. */
+  String delimiter = ", ";
+
+  /**
+   * Consumes the optional "delimiter" init arg (removing it from the list)
+   * and then hands the remaining args to the superclass.
+   */
+  @SuppressWarnings("unchecked")
+  @Override
+  public void init(NamedList args) {
+    final Object configured = args.remove("delimiter");
+    if (configured != null) {
+      delimiter = configured.toString();
+    }
+
+    super.init(args);
+  }
+
+  /**
+   * Builds a processor that replaces every selected field holding more than
+   * one value with a new field of the same name whose single value is the
+   * delimiter-joined values, carrying over the source field's boost.
+   */
+  @Override
+  public UpdateRequestProcessor getInstance(SolrQueryRequest req,
+                                            SolrQueryResponse rsp,
+                                            UpdateRequestProcessor next) {
+    return new FieldMutatingUpdateProcessor(getSelector(), next) {
+      protected SolrInputField mutate(final SolrInputField src) {
+        // zero or one value: nothing to join, hand the field back untouched
+        if (src.getValueCount() <= 1) {
+          return src;
+        }
+
+        final SolrInputField merged = new SolrInputField(src.getName());
+        final String joined = StringUtils.join(src.getValues(), delimiter);
+        merged.setValue(joined, src.getBoost());
+        return merged;
+      }
+    };
+  }
+
+  /**
+   * Default selection: field names whose schema type is a TextField or
+   * StrField and whose schema field is not multiValued.
+   */
+  @Override
+  public FieldMutatingUpdateProcessor.FieldNameSelector
+    getDefaultSelector(final SolrCore core) {
+
+    final IndexSchema schema = core.getSchema();
+    return new FieldMutatingUpdateProcessor.FieldNameSelector() {
+      public boolean shouldMutate(final String fieldName) {
+
+        // the type check comes first since it should be the cheapest;
+        // a null type fails both instanceof tests and is rejected here too
+        final FieldType fieldType = schema.getFieldTypeNoEx(fieldName);
+        final boolean stringLike = (fieldType instanceof TextField)
+          || (fieldType instanceof StrField);
+        if (! stringLike) {
+          return false;
+        }
+
+        // only look up the SchemaField once the type check has passed;
+        // it shouldn't be null since the type wasn't, but just in case
+        final SchemaField field = schema.getFieldOrNull(fieldName);
+        return null != field && ! field.multiValued();
+      }
+    };
+  }
+
+}
+
diff --git a/solr/core/src/java/org/apache/solr/update/processor/FieldLengthUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/FieldLengthUpdateProcessorFactory.java
new file mode 100644
index 00000000000..8ffa6fef84b
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/update/processor/FieldLengthUpdateProcessorFactory.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.core.SolrCore;
+
+
+/**
+ * Replaces any CharSequence values found in fields matching the specified
+ * conditions with the lengths of those CharSequences (as an Integer).
+ *
+ * By default, this processor matches no fields.
+ *
+ *
+ * For example, with the configuration listed below any documents
+ * containing String values (such as "abcdef" or
+ * "xyz") in a field declared in the schema using
+ * TrieIntField or TrieLongField
+ * would have those Strings replaced with the length of those fields as an
+ * Integer
+ * (ie: 6 and 3 respectively)
+ *
+ */
+public final class FieldLengthUpdateProcessorFactory extends FieldMutatingUpdateProcessorFactory {
+
+  /**
+   * Passes all init args straight through to the superclass; this factory
+   * defines no args of its own.
+   */
+  @SuppressWarnings("unchecked")
+  @Override
+  public void init(NamedList args) {
+    // no length specific init args
+    super.init(args);
+  }
+
+  /**
+   * Default selection used when no criteria are configured: no fields.
+   */
+  @Override
+  public FieldMutatingUpdateProcessor.FieldNameSelector
+    getDefaultSelector(final SolrCore core) {
+
+    return FieldMutatingUpdateProcessor.SELECT_NO_FIELDS;
+
+  }
+
+  /**
+   * Builds a processor that replaces each CharSequence value of the
+   * selected fields with its length as an Integer; values of any other
+   * type are returned unchanged.
+   */
+  @Override
+  public UpdateRequestProcessor getInstance(SolrQueryRequest req,
+                                            SolrQueryResponse rsp,
+                                            UpdateRequestProcessor next) {
+    return new FieldValueMutatingUpdateProcessor(getSelector(), next) {
+      protected Object mutateValue(final Object src) {
+        if (src instanceof CharSequence) {
+          // valueOf instead of the deprecated boxing constructor
+          return Integer.valueOf(((CharSequence) src).length());
+        }
+        return src;
+      }
+    };
+  }
+}
+
diff --git a/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessor.java
new file mode 100644
index 00000000000..87f22016ada
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessor.java
@@ -0,0 +1,283 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import static org.apache.solr.common.SolrException.ErrorCode.*;
+
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.FieldType;
+
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.update.AddUpdateCommand;
+
+/**
+ * Reusable base class for UpdateProcessors that will consider
+ * AddUpdateCommands and mutate the values associated with configured
+ * fields.
+ *
+ * Subclasses should override the mutate method to specify how individual
+ * SolrInputFields identified by the selector associated with this instance
+ * will be mutated.
+ *
+ *
+ * @see FieldMutatingUpdateProcessorFactory
+ * @see FieldValueMutatingUpdateProcessor
+ * @see FieldNameSelector
+ */
+public abstract class FieldMutatingUpdateProcessor
+  extends UpdateRequestProcessor {
+
+  /** Decides which of a document's fields are handed to {@link #mutate}. */
+  private final FieldNameSelector selector;
+
+  /**
+   * @param selector consulted with each field name of every added document
+   * @param next passed through to the UpdateRequestProcessor constructor
+   */
+  public FieldMutatingUpdateProcessor(FieldNameSelector selector,
+                                      UpdateRequestProcessor next) {
+    super(next);
+    this.selector = selector;
+  }
+
+  /**
+   * Method for mutating SolrInputFields associated with fields identified
+   * by the FieldNameSelector associated with this processor
+   * @param src the SolrInputField to mutate, may be modified in place and
+   *            returned
+   * @return the SolrInputField to use in replacing the original (src) value.
+   *         If null the field will be removed.
+   */
+  protected abstract SolrInputField mutate(final SolrInputField src);
+
+  /**
+   * Applies {@link #mutate} to every selected field of the document being
+   * added, then delegates to <code>super.processAdd(cmd)</code>.
+   * A null result removes the field; any other result replaces it.
+   *
+   * @throws SolrException (SERVER_ERROR) if mutate returns a field whose
+   *         name differs from the name of the field it was given
+   */
+  @Override
+  public void processAdd(AddUpdateCommand cmd) throws IOException {
+    final SolrInputDocument doc = cmd.getSolrInputDocument();
+
+    // make a copy we can iterate over while mutating the doc
+    final Collection<String> fieldNames
+      = new ArrayList<String>(doc.getFieldNames());
+
+    for (final String fname : fieldNames) {
+
+      if (! selector.shouldMutate(fname)) continue;
+
+      final SolrInputField src = doc.get(fname);
+      final SolrInputField dest = mutate(src);
+      if (null == dest) {
+        doc.remove(fname);
+      } else {
+        // semantics of what happens if dest has diff name are hard
+        // we could treat it as a copy, or a rename
+        // for now, don't allow it.
+        if (! fname.equals(dest.getName()) ) {
+          throw new SolrException(SERVER_ERROR,
+                                  "mutate returned field with different name: "
+                                  + fname + " => " + dest.getName());
+        }
+        doc.put(dest.getName(), dest);
+      }
+    }
+    super.processAdd(cmd);
+  }
+
+  /**
+   * Interface for identifying which fields should be mutated
+   */
+  public static interface FieldNameSelector {
+    public boolean shouldMutate(final String fieldName);
+  }
+
+  /** Singleton indicating all fields should be mutated */
+  public static final FieldNameSelector SELECT_ALL_FIELDS
+    = new FieldNameSelector() {
+        public boolean shouldMutate(final String fieldName) {
+          return true;
+        }
+      };
+
+  /** Singleton indicating no fields should be mutated */
+  public static final FieldNameSelector SELECT_NO_FIELDS
+    = new FieldNameSelector() {
+        public boolean shouldMutate(final String fieldName) {
+          return false;
+        }
+      };
+
+  /**
+   * Wraps two FieldNameSelectors such that the FieldNameSelector
+   * returned matches all fields specified by the "includes" unless they
+   * are matched by "excludes"
+   * @param includes a selector identifying field names that should be selected
+   * @param excludes a selector identifying field names that should
+   *        not be selected, even if they are matched by the 'includes'
+   *        selector
+   * @return Either a new FieldNameSelector or one of the input selectors
+   *         if the combination lends itself to optimization.
+   */
+  public static FieldNameSelector wrap(final FieldNameSelector includes,
+                                       final FieldNameSelector excludes) {
+
+    // nothing is excluded: the includes selector alone decides
+    if (SELECT_NO_FIELDS == excludes) {
+      return includes;
+    }
+
+    // everything is excluded: nothing can ever be selected
+    if (SELECT_ALL_FIELDS == excludes) {
+      return SELECT_NO_FIELDS;
+    }
+
+    // everything is included: only the excludes need to be consulted
+    if (SELECT_ALL_FIELDS == includes) {
+      return new FieldNameSelector() {
+        public boolean shouldMutate(final String fieldName) {
+          return ! excludes.shouldMutate(fieldName);
+        }
+      };
+    }
+
+    return new FieldNameSelector() {
+      public boolean shouldMutate(final String fieldName) {
+        return (includes.shouldMutate(fieldName)
+                && ! excludes.shouldMutate(fieldName));
+      }
+    };
+  }
+
+  /**
+   * Utility method that can be used to define a FieldNameSelector
+   * using the same types of rules as the FieldMutatingUpdateProcessor init
+   * code.  This may be useful for Factories that wish to define default
+   * selectors in similar terms to what the configuration would look like.
+   *
+   * @param loader used to resolve each entry of typeClasses into a Class
+   * @param schema used to look up the FieldType of candidate field names
+   * @param fields field names to select; empty means no restriction
+   * @param typeNames fieldType names to select; empty means no restriction
+   * @param typeClasses fieldType class names to select; empty means no
+   *        restriction
+   * @param regexes field name patterns to select; empty means no restriction
+   * @param defSelector returned as is when all four criteria are empty
+   * @throws SolrException (SERVER_ERROR) if a typeClasses entry can not be
+   *         resolved by the loader
+   * @lucene.internal
+   */
+  public static FieldNameSelector createFieldNameSelector
+    (final SolrResourceLoader loader,
+     final IndexSchema schema,
+     final Set<String> fields,
+     final Set<String> typeNames,
+     final Collection<String> typeClasses,
+     final Collection<Pattern> regexes,
+     final FieldNameSelector defSelector) {
+
+    final Collection<Class<?>> classes
+      = new ArrayList<Class<?>>(typeClasses.size());
+
+    for (String t : typeClasses) {
+      try {
+        classes.add(loader.findClass(t));
+      } catch (Exception e) {
+        throw new SolrException(SERVER_ERROR,
+                                "Can't resolve typeClass: " + t, e);
+      }
+    }
+
+    if (classes.isEmpty() &&
+        typeNames.isEmpty() &&
+        regexes.isEmpty() &&
+        fields.isEmpty()) {
+      return defSelector;
+    }
+
+    return new ConfigurableFieldNameSelector
+      (schema, fields, typeNames, classes, regexes);
+  }
+
+  /**
+   * Selector driven by four kinds of criteria (field names, fieldType
+   * names, fieldType classes, field name regexes).  A kind with no entries
+   * imposes no restriction; a field name must satisfy every kind that has
+   * entries.
+   */
+  private static final class ConfigurableFieldNameSelector
+    implements FieldNameSelector {
+
+    final IndexSchema schema;
+    final Set<String> fields;
+    final Set<String> typeNames;
+    final Collection<Class<?>> classes;
+    final Collection<Pattern> regexes;
+
+    private ConfigurableFieldNameSelector(final IndexSchema schema,
+                                          final Set<String> fields,
+                                          final Set<String> typeNames,
+                                          final Collection<Class<?>> classes,
+                                          final Collection<Pattern> regexes) {
+      this.schema = schema;
+      this.fields = fields;
+      this.typeNames = typeNames;
+      this.classes = classes;
+      this.regexes = regexes;
+    }
+
+    public boolean shouldMutate(final String fieldName) {
+
+      // order of checks is based on what should be quicker
+      // (ie: set lookups are faster than looping over instanceOf / matches tests)
+
+      if ( ! (fields.isEmpty() || fields.contains(fieldName)) ) {
+        return false;
+      }
+
+      // do not consider it an error if the fieldName has no type
+      // there might be another processor dealing with it later
+      // (in that case both type based checks are simply skipped)
+      FieldType t = schema.getFieldTypeNoEx(fieldName);
+      if (null != t) {
+        if (! (typeNames.isEmpty() || typeNames.contains(t.getTypeName())) ) {
+          return false;
+        }
+
+        if (! (classes.isEmpty() || instanceOfAny(t, classes)) ) {
+          return false;
+        }
+      }
+
+      if (! (regexes.isEmpty() || matchesAny(fieldName, regexes)) ) {
+        return false;
+      }
+
+      return true;
+    }
+
+    /**
+     * returns true if the Object 'o' is an instance of any class in
+     * the Collection
+     */
+    private static boolean instanceOfAny(Object o, Collection<Class<?>> classes) {
+      for (Class<?> c : classes) {
+        if ( c.isInstance(o) ) return true;
+      }
+      return false;
+    }
+
+    /**
+     * returns true if the CharSequence 's' matches any Pattern in the
+     * Collection
+     */
+    private static boolean matchesAny(CharSequence s,
+                                      Collection<Pattern> regexes) {
+      for (Pattern p : regexes) {
+        if (p.matcher(s).matches()) return true;
+      }
+      return false;
+    }
+  }
+}
+
diff --git a/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java
new file mode 100644
index 00000000000..2e3a1cb4681
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/update/processor/FieldMutatingUpdateProcessorFactory.java
@@ -0,0 +1,284 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.common.SolrException;
+import static org.apache.solr.common.SolrException.ErrorCode.*;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.util.plugin.SolrCoreAware;
+
+
+/**
+ * Base class for implementing Factories for FieldMutatingUpdateProcessors and
+ * FieldValueMutatingUpdateProcessors.
+ *
+ *
+ * This class provides all of the plumbing for configuring the
+ * FieldNameSelector using the following init params to specify selection
+ * criteria...
+ *
+ *
+ * <ul>
+ * <li>fieldName - selecting specific fields by field name lookup</li>
+ *
+ * <li>fieldRegex - selecting specific fields by field name regex match (regexes are checked in the order specified)</li>
+ *
+ * <li>typeName - selecting specific fields by fieldType name lookup</li>
+ *
+ * <li>typeClass - selecting specific fields by fieldType class lookup, including inheritance and interfaces</li>
+ * </ul>
+ *
+ *
+ * Each criterion can be specified as either an &lt;arr&gt; of &lt;str&gt;, or
+ * multiple &lt;str&gt; with the same name. When multiple criteria of a
+ * single type exist, fields must match at least one to be selected.
+ * If more than one type of criteria exist, fields must match
+ * at least one of each to be selected.
+ *
+ *
+ * One or more excludes &lt;lst&gt; params may also be specified,
+ * containing any of the above criteria, identifying fields to be excluded
+ * from selection even if they match the selection criteria. As with the main
+ * selection criteria a field must match all of the criteria in a single exclusion
+ * in order to be excluded, but multiple exclusions may be specified to get an
+ * OR behavior
+ *
+ *
+ *
+ * In the ExampleFieldMutatingUpdateProcessorFactory configured below,
+ * fields will be mutated if the name starts with "foo" or "bar";
+ * unless the field name contains the substring "SKIP" or
+ * the fieldType is (or subclasses) DateField. Meaning a field named
+ * "foo_SKIP" is guaranteed not to be selected, but a field named "bar_smith"
+ * that uses StrField will be selected.
+ *
+ * Subclasses define the default selection behavior to be applied if no
+ * criteria is configured by the user. User configured "exclude" criteria
+ * will be applied to the subclass defined default selector.
+ *
+ *
+ * @see FieldMutatingUpdateProcessor
+ * @see FieldValueMutatingUpdateProcessor
+ * @see FieldMutatingUpdateProcessor.FieldNameSelector
+ */
+public abstract class FieldMutatingUpdateProcessorFactory
+ extends UpdateRequestProcessorFactory
+ implements SolrCoreAware {
+
+ private static class SelectorParams {
+ public Set fieldName = Collections.emptySet();
+ public Set typeName = Collections.emptySet();
+ public Collection typeClass = Collections.emptyList();
+ public Collection fieldRegex = Collections.emptyList();
+ }
+
+ private SelectorParams inclusions = new SelectorParams();
+ private Collection exclusions
+ = new ArrayList();
+
+ private FieldMutatingUpdateProcessor.FieldNameSelector selector = null;
+
+ protected final FieldMutatingUpdateProcessor.FieldNameSelector getSelector() {
+ if (null != selector) return selector;
+
+ throw new SolrException(SERVER_ERROR, "selector was never initialized, "+
+ " inform(SolrCore) never called???");
+ }
+
+ @SuppressWarnings("unchecked")
+ private static final SelectorParams parseSelectorParams(NamedList args) {
+ SelectorParams params = new SelectorParams();
+
+ params.fieldName = new HashSet(oneOrMany(args, "fieldName"));
+ params.typeName = new HashSet(oneOrMany(args, "typeName"));
+
+ // we can compile the patterns now
+ Collection patterns = oneOrMany(args, "fieldRegex");
+ if (! patterns.isEmpty()) {
+ params.fieldRegex = new ArrayList(patterns.size());
+ for (String s : patterns) {
+ try {
+ params.fieldRegex.add(Pattern.compile(s));
+ } catch (PatternSyntaxException e) {
+ throw new SolrException
+ (SERVER_ERROR, "Invalid 'fieldRegex' pattern: " + s, e);
+ }
+ }
+ }
+
+ // resolve this into actual Class objects later
+ params.typeClass = oneOrMany(args, "typeClass");
+
+ return params;
+ }
+
+
+ /**
+ * Handles common initialization related to source fields for
+ * constructoring the FieldNameSelector to be used.
+ *
+ * Will error if any unexpected init args are found, so subclasses should
+ * remove any subclass-specific init args before calling this method.
+ */
+ @SuppressWarnings("unchecked")
+ @Override
+ public void init(NamedList args) {
+
+ inclusions = parseSelectorParams(args);
+
+ List