SOLR-11774: langid.map.individual now works together with langid.map.keepOrig

2019-01-03 18:26:27 +01:00 · 2019-01-03 18:26:27 +01:00 · 00f8f3a13a
parent 6342ec699e
commit 00f8f3a13a
8 changed files with 489 additions and 126 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -76,6 +76,9 @@ Upgrade Notes
  This choice used to be toggleable with an internal/expert "anonChildDocs" parameter flag which is now gone.
  (David Smiley)

+* SOLR-11774: In the 'langid' contrib, some method signatures of the LanguageIdentifierUpdateProcessor base class
+  have changed. If you have a custom language identifier implementation, you will need to adapt your code.
+  
 New Features
 ----------------------

@ -100,6 +103,9 @@ Bug Fixes

 * SOLR-13058: Fix block that was synchronizing on the wrong collection in OverseerTaskProcessor (Gus Heck)

+* SOLR-11774: langid.map.individual now works together with langid.map.keepOrig. Also, the detectLanguage() API
+  now accepts a Reader, allowing for more memory-efficient implementations (janhoy)
+
 Improvements
 ----------------------

--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
@ -16,9 +16,10 @@
 */
 package org.apache.solr.update.processor;

+import java.io.IOException;
+import java.io.Reader;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.Collections;
 import java.util.List;

@ -29,7 +30,6 @@ import com.cybozu.labs.langdetect.Detector;
 import com.cybozu.labs.langdetect.DetectorFactory;
 import com.cybozu.labs.langdetect.LangDetectException;
 import com.cybozu.labs.langdetect.Language;
-import org.apache.solr.common.SolrInputDocument;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@ -48,33 +48,26 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
    super(req, rsp, next);
  }

+  /**
+   * Detects language(s) from a reader, typically based on some fields in SolrInputDocument.
+   * Classes wishing to implement their own language detection module should override this method.
+   *
+   * @param solrDocReader A reader serving the text from the document to detect
+   * @return List of detected language(s) according to RFC-3066
+   */
  @Override
-  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+  protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
    try {
      Detector detector = DetectorFactory.create();
      detector.setMaxTextLength(maxTotalChars);

-      for (String fieldName : inputFields) {
-        log.debug("Appending field " + fieldName);
-        if (doc.containsKey(fieldName)) {
-          Collection<Object> fieldValues = doc.getFieldValues(fieldName);
-          if (fieldValues != null) {
-            for (Object content : fieldValues) {
-              if (content instanceof String) {
-                String stringContent = (String) content;
-                if (stringContent.length() > maxFieldValueChars) {
-                  detector.append(stringContent.substring(0, maxFieldValueChars));
-                } else {
-                  detector.append(stringContent);
-                }
-                detector.append(" ");
-              } else {
-                log.warn("Field " + fieldName + " not a String value, not including in detection");
-              }
-            }
-          }
-        }
+      // TODO Work around bug in LangDetect 1.1 which does not expect a -1 return value at end of stream,
+      // but instead only looks at ready()
+      if (solrDocReader instanceof SolrInputDocumentReader) {
+        ((SolrInputDocumentReader)solrDocReader).setEodReturnValue(0);
      }
+      detector.append(solrDocReader);
+
      ArrayList<Language> langlist = detector.getProbabilities();
      ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
      for (Language l: langlist) {
@ -84,6 +77,9 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
    } catch (LangDetectException e) {
      log.debug("Could not determine language, returning empty list: ", e);
      return Collections.emptyList();
+    } catch (IOException e) {
+      log.warn("Could not determine language.", e);
+      return Collections.emptyList();
    }
  }
 }
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
@ -30,10 +30,10 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
+import java.io.Reader;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@ -41,11 +41,11 @@ import java.util.regex.Pattern;


 /**
- * Identifies the language of a set of input fields.
- * Also supports mapping of field names based
- * on detected language.
 * <p>
- * See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
+ *   Identifies the language of a set of input fields.
+ *   Also supports mapping of field names based on detected language.
+ * </p>
+ * See <a href="https://lucene.apache.org/solr/guide/7_4/detecting-languages-during-indexing.html">Detecting Languages During Indexing</a> in reference guide
 * @since 3.5
 * @lucene.experimental
 */
@ -207,11 +207,10 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
  }

  /**
-   * This is the main, testable process method called from processAdd()
-   * @param doc the SolrInputDocument to work on
-   * @return the modified SolrInputDocument
+   * This is the main process method called from processAdd()
+   * @param doc the SolrInputDocument to modify
   */
-  protected SolrInputDocument process(SolrInputDocument doc) {
+  protected void process(SolrInputDocument doc) {
    String docLang = null;
    HashSet<String> docLangs = new HashSet<>();
    String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
@ -240,7 +239,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
        if(doc.containsKey(fieldName)) {
          String fieldLang;
          if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
-            List<DetectedLanguage> languagelist = detectLanguage(doc);
+            List<DetectedLanguage> languagelist = detectLanguage(solrDocReader(doc, new String[]{fieldName}));
            fieldLang = resolveLanguage(languagelist, docLang);
            docLangs.add(fieldLang);
            log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
@ -270,8 +269,6 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
    if(langsField != null && langsField.length() != 0) {
      doc.setField(langsField, docLangs.toArray());
    }
-
-    return doc;
  }

  /**
@ -297,12 +294,21 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
  }

  /**
-   * Detects language(s) from a string.
-   * Classes wishing to implement their own language detection module should override this method.
-   * @param content The content to identify
+   * Detects language(s) from all configured fields
+   * @param doc The solr document
   * @return List of detected language(s) according to RFC-3066
   */
-  protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content);
+  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+    // Wrap every configured input field in one reader and hand it to the Reader-based overload
+    final Reader allInputFields = solrDocReader(doc, inputFields);
+    return detectLanguage(allInputFields);
+  }
+
+  /**
+   * Detects language(s) from a reader, typically based on some fields in SolrInputDocument.
+   * Classes wishing to implement their own language detection module should override this method.
+   * @param solrDocReader A reader serving the text from the document to detect
+   * @return List of detected language(s) according to RFC-3066
+   */
+  protected abstract List<DetectedLanguage> detectLanguage(Reader solrDocReader);

  /**
   * Chooses a language based on the list of candidates detected
@ -400,67 +406,22 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
    this.enabled = enabled;
  }

-
-
  /**
-   * Concatenates content from multiple fields
+   * Returns a reader that streams String content from fields.
+   * This is more memory efficient than building a full string buffer.
+   * @param doc the solr document
+   * @param fields the field names to read
+   * @return a reader over the fields
+   */
+  protected SolrInputDocumentReader solrDocReader(SolrInputDocument doc, String[] fields) {
+    // Field values are separated by a single space in the resulting stream
+    final String valueSeparator = " ";
+    return new SolrInputDocumentReader(doc, fields, maxTotalChars, maxFieldValueChars, valueSeparator);
+  }
+  
+  /**
+   * Concatenates content from input fields defined in langid.fl.
+   * For test purposes only
   */
  protected String concatFields(SolrInputDocument doc) {
-    StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
-    for (String fieldName : inputFields) {
-      log.debug("Appending field " + fieldName);
-      if (doc.containsKey(fieldName)) {
-        Collection<Object> fieldValues = doc.getFieldValues(fieldName);
-        if (fieldValues != null) {
-          for (Object content : fieldValues) {
-            if (content instanceof String) {
-              String stringContent = (String) content;
-              if (stringContent.length() > maxFieldValueChars) {
-                sb.append(stringContent.substring(0, maxFieldValueChars));
-              } else {
-                sb.append(stringContent);
-              }
-              sb.append(" ");
-              if (sb.length() > maxTotalChars) {
-                sb.setLength(maxTotalChars);
-                break;
-              }
-            } else {
-              log.warn("Field " + fieldName + " not a String value, not including in detection");
-            }
-          }
-        }
-      }
-    }
-    return sb.toString();
-  }
-
-  /**
-   * Calculate expected string size.
-   *
-   * @param doc           solr input document
-   * @param fields        fields to select
-   * @return expected size of string value
-   */
-  private int getExpectedSize(SolrInputDocument doc, String[] fields) {
-    int docSize = 0;
-    for (String field : fields) {
-      if (doc.containsKey(field)) {
-        Collection<Object> contents = doc.getFieldValues(field);
-        if (contents != null) {
-          for (Object content : contents) {
-            if (content instanceof String) {
-              docSize += Math.min(((String) content).length(), maxFieldValueChars);
-            }
-          }
-
-          if (docSize > maxTotalChars) {
-            docSize = maxTotalChars;
-            break;
-          }
-        }
-      }
-    }
-    return docSize;
+    return SolrInputDocumentReader.asString(solrDocReader(doc, inputFields));
  }
 }
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
@ -16,6 +16,7 @@
 */
 package org.apache.solr.update.processor;

+import java.io.Reader;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.HashMap;
@ -23,7 +24,6 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;

-import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.slf4j.Logger;
@ -54,9 +54,9 @@ public class OpenNLPLangDetectUpdateProcessor extends LanguageIdentifierUpdatePr
  }

  @Override
-  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+  protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
    List<DetectedLanguage> languages = new ArrayList<>();
-    String content = concatFields(doc);
+    String content = SolrInputDocumentReader.asString(solrDocReader);
    if (content.length() != 0) {
      LanguageDetectorME ldme = new LanguageDetectorME(model);
      Language[] langs = ldme.predictLanguages(content);
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/SolrInputDocumentReader.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/SolrInputDocumentReader.java
@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.lang.invoke.MethodHandles;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reader on top of SolrInputDocument that can "stream" a document as a character stream in a memory
+ * efficient way, to avoid potentially large intermediate string buffers containing whole document content.
+ * <p>
+ * The stream is the string form of every value of the listed fields, in field order, with the
+ * separator inserted between consecutive values. At most {@code maxCharsPerFieldValue} characters
+ * are taken from each value and at most {@code maxTotalChars} characters are served in total.
+ * Fields that do not exist on the document are skipped. Instances hold unsynchronized mutable
+ * state and must not be shared between threads.
+ * </p>
+ * @lucene.experimental
+ */
+public class SolrInputDocumentReader extends Reader {
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  private final SolrInputDocument doc;
+  private final String[] fields;
+  private final String fieldValueSep;
+  private final int maxTotalChars;
+  private final int maxCharsPerFieldValue;
+  private int totalCharsConsumed;
+
+  // Remember where we are at
+  // Index of the next field to open
+  private int currentFieldIdx = 0;
+  // Remaining values of the field currently being read; null until a field has been opened
+  private Iterator<Object> currentFieldValues;
+  // Text currently being served (a separator or a field value); [segmentPos, segmentEnd) is what is left of it
+  private String segment = "";
+  private int segmentPos = 0;
+  private int segmentEnd = 0;
+  // Field value queued behind the separator that is currently being served
+  private String pendingValue;
+  private boolean eod = false;
+  // Normally a Reader will return -1 at end of document, but to work around LangDetect's bug, we allow another value
+  private int eodReturnValue = -1;
+
+  /**
+   * Creates a character-stream reader that streams all String fields in the document with space as separator.
+   * A field counts as a String field when its first value is a String.
+   *
+   * @param doc Solr input document
+   * @param maxTotalChars max chars to consume total
+   * @param maxCharsPerFieldValue max chars to consume per field value
+   * @throws IllegalArgumentException if the document has no String fields
+   */
+  public SolrInputDocumentReader(SolrInputDocument doc, int maxTotalChars, int maxCharsPerFieldValue) {
+    this(doc, getStringFields(doc), maxTotalChars, maxCharsPerFieldValue, " ");
+  }
+
+  /**
+   * Creates a character-stream reader that reads the listed fields in order, with
+   * max lengths as specified. Values that are not Strings are streamed in their
+   * {@link String#valueOf(Object)} form.
+   *
+   * @param doc Solr input document
+   * @param fields list of field names to include
+   * @param maxTotalChars max chars to consume total
+   * @param maxCharsPerFieldValue max chars to consume per field value
+   * @param fieldValueSep separator to insert between field values, null is treated as no separator
+   * @throws IllegalArgumentException if {@code fields} is null or empty
+   */
+  public SolrInputDocumentReader(SolrInputDocument doc, String[] fields, int maxTotalChars,
+                                 int maxCharsPerFieldValue, String fieldValueSep) {
+    if (fields == null || fields.length == 0) throw new IllegalArgumentException("fields cannot be empty");
+    this.doc = doc;
+    this.fields = fields;
+    this.fieldValueSep = fieldValueSep == null ? "" : fieldValueSep;
+    this.maxTotalChars = maxTotalChars;
+    this.maxCharsPerFieldValue = maxCharsPerFieldValue;
+  }
+
+  /**
+   * Copies up to {@code len} characters of the document stream into {@code cbuf}.
+   * The position inside the current value or separator is remembered between calls, so the
+   * result does not depend on how the consumer sizes its reads.
+   *
+   * @param cbuf destination buffer
+   * @param off offset in {@code cbuf} at which to start storing characters
+   * @param len maximum number of characters to read
+   * @return the number of characters read, or the end-of-document value (-1 unless changed through
+   *         {@link #setEodReturnValue(int)}) once no more characters can be served. A call that
+   *         yields no characters, including one with {@code len == 0}, marks end of document.
+   * @throws IndexOutOfBoundsException if {@code off} or {@code len} is negative, or the characters
+   *         read do not fit in {@code cbuf}
+   */
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    if (off < 0 || len < 0) {
+      throw new IndexOutOfBoundsException("off=" + off + ", len=" + len);
+    }
+    if (eod) return eodReturnValue;
+
+    // Never serve more than what is left of the total budget
+    int limit = Math.min(len, maxTotalChars - totalCharsConsumed);
+    int written = 0;
+    while (written < limit) {
+      if (segmentPos >= segmentEnd && !nextSegment()) {
+        break; // document exhausted
+      }
+      int n = Math.min(limit - written, segmentEnd - segmentPos);
+      // Copy straight from the value into the caller's buffer, no intermediate string
+      segment.getChars(segmentPos, segmentPos + n, cbuf, off + written);
+      segmentPos += n;
+      written += n;
+      totalCharsConsumed += n;
+    }
+
+    if (written == 0) {
+      eod = true;
+      return eodReturnValue;
+    }
+    return written;
+  }
+
+  /**
+   * Moves on to the next piece of text to serve: the value queued behind a separator, or else the
+   * next field value of the document, preceded by a separator if anything was served before it.
+   *
+   * @return false when all listed fields are exhausted
+   */
+  private boolean nextSegment() {
+    if (pendingValue != null) {
+      startSegment(pendingValue, maxCharsPerFieldValue);
+      pendingValue = null;
+      return true;
+    }
+
+    while (currentFieldValues == null || !currentFieldValues.hasNext()) {
+      if (currentFieldIdx >= fields.length) {
+        return false;
+      }
+      String fieldName = fields[currentFieldIdx++];
+      SolrInputField field = doc.getField(fieldName);
+      if (field == null) {
+        log.debug("Field with name {} did not exist on document.", fieldName);
+        currentFieldValues = null;
+      } else {
+        currentFieldValues = field.iterator();
+      }
+    }
+
+    String value = String.valueOf(currentFieldValues.next());
+    if (totalCharsConsumed > 0) {
+      // Something was already served: separator first, value right after it
+      pendingValue = value;
+      startSegment(fieldValueSep, Integer.MAX_VALUE);
+    } else {
+      startSegment(value, maxCharsPerFieldValue);
+    }
+    return true;
+  }
+
+  /** Makes {@code text}, truncated to at most {@code maxLen} characters, the text being served. */
+  private void startSegment(String text, int maxLen) {
+    segment = text;
+    segmentPos = 0;
+    segmentEnd = Math.max(0, Math.min(text.length(), maxLen));
+  }
+
+  @Override
+  public void close() throws IOException { /* ignored */ }
+
+  /**
+   * Tells whether end of document has been reached yet.
+   * @return true until a call to read has hit end of document
+   */
+  @Override
+  public boolean ready() throws IOException {
+    return !eod;
+  }
+
+  /**
+   * Choose another return value than -1 for end of document reached.
+   * <b>Warning: Only to work around buggy consumers such as LangDetect 1.1</b>
+   * @param eodReturnValue integer which defaults to -1
+   */
+  public void setEodReturnValue(int eodReturnValue) {
+    this.eodReturnValue = eodReturnValue;
+  }
+
+  /**
+   * Gets the whole reader as a String.
+   * @param reader the reader to drain
+   * @return string of concatenated fields
+   * @throws SolrException with code SERVER_ERROR if reading fails
+   */
+  public static String asString(Reader reader) {
+    try {
+      return IOUtils.toString(reader);
+    } catch (IOException e) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Failed reading doc content from reader", e);
+    }
+  }
+
+  /**
+   * Finds the fields of a document whose first value is a String.
+   * @param doc Solr input document
+   * @return names of the String fields, in document iteration order
+   */
+  protected static String[] getStringFields(SolrInputDocument doc) {
+    Iterable<SolrInputField> iterable = () -> doc.iterator();
+    List<String> strFields = StreamSupport.stream(iterable.spliterator(), false)
+        .filter(f -> f.getFirstValue() instanceof String)
+        .map(SolrInputField::getName).collect(Collectors.toList());
+    return strFields.toArray(new String[0]);
+  }
+}
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
@ -16,6 +16,7 @@
 */
 package org.apache.solr.update.processor;

+import java.io.Reader;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.List;
@ -23,8 +24,6 @@ import java.util.List;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.tika.language.LanguageIdentifier;
-
-import org.apache.solr.common.SolrInputDocument;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@ -44,11 +43,11 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd
      SolrQueryResponse rsp, UpdateRequestProcessor next) {
    super(req, rsp, next);
  }
-  
+
  @Override
-  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+  protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
+    String content = SolrInputDocumentReader.asString(solrDocReader);
    List<DetectedLanguage> languages = new ArrayList<>();
-    String content = concatFields(doc);
    if (content.length() != 0) {
      LanguageIdentifier identifier = new LanguageIdentifier(content);
      // FIXME: Hack - we get the distance from toString and calculate our own certainty score
--- a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
+++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
@ -140,14 +140,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    liProcessor = createLangIdProcessor(parameters);
    
    doc = englishDoc();
-    assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
-    assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
+    assertEquals("en", process(doc).getFieldValue("language"));
+    assertEquals("en", process(doc).getFieldValue("languages"));
    
    doc = englishDoc();
    doc.setField("language", "no");
-    assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
-    assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
-    assertNotNull(liProcessor.process(doc).getFieldValue("text_no"));
+    assertEquals("no", process(doc).getFieldValue("language"));
+    assertEquals("no", process(doc).getFieldValue("languages"));
+    assertNotNull(process(doc).getFieldValue("text_no"));
  }

  /**
@ -166,14 +166,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    liProcessor = createLangIdProcessor(parameters);
    
    doc = englishDoc();
-    assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
-    assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
+    assertEquals("en", process(doc).getFieldValue("language"));
+    assertEquals("en", process(doc).getFieldValue("languages"));
    
    doc = englishDoc();
    doc.setField("language", "no");
-    assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
-    assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
-    assertNotNull(liProcessor.process(doc).getFieldValue("text_multivalue_no"));
+    assertEquals("no", process(doc).getFieldValue("language"));
+    assertEquals("no", process(doc).getFieldValue("languages"));
+    assertNotNull(process(doc).getFieldValue("text_multivalue_no"));
  }

  /**
@ -192,14 +192,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    liProcessor = createLangIdProcessor(parameters);

    doc = mixedEnglishRussianDoc();
-    assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
-    assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
+    assertEquals("en", process(doc).getFieldValue("language"));
+    assertEquals("en", process(doc).getFieldValue("languages"));

    doc = mixedEnglishRussianDoc();
    doc.setField("language", "no");
-    assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
-    assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
-    assertNotNull(liProcessor.process(doc).getFieldValue("text_multivalue_no"));
+    assertEquals("no", process(doc).getFieldValue("language"));
+    assertEquals("no", process(doc).getFieldValue("languages"));
+    assertNotNull(process(doc).getFieldValue("text_multivalue_no"));
  }

  @Test
@ -212,7 +212,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    liProcessor = createLangIdProcessor(parameters);
    
    doc = tooShortDoc();
-    assertEquals("", liProcessor.process(doc).getFieldValue("language"));
+    assertEquals("", process(doc).getFieldValue("language"));
  }

  @Test
@ -225,7 +225,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    liProcessor = createLangIdProcessor(parameters);

    doc = new SolrInputDocument();
-    assertEquals("", liProcessor.process(doc).getFieldValue("language"));
+    assertEquals("", process(doc).getFieldValue("language"));
  }

  @Test
@ -242,11 +242,11 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    // Verify fallback to field fb (noop field does not exist and is skipped)
    doc = tooShortDoc();
    doc.addField("fb", "fbField");
-    assertEquals("fbField", liProcessor.process(doc).getFieldValue("language"));
+    assertEquals("fbField", process(doc).getFieldValue("language"));

    // Verify fallback to fallback value since no fallback fields exist
    doc = tooShortDoc();
-    assertEquals("fbVal", liProcessor.process(doc).getFieldValue("language"));  
+    assertEquals("fbVal", process(doc).getFieldValue("language"));  
  }
  
  @Test
@ -272,6 +272,60 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    assertEquals("fallback", liProcessor.resolveLanguage(langs, "fallback"));    
  }
  
+  @Test
+  public void testKeepOrig() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set("langid.fl", "text");
+    params.set("langid.langField", "language");
+    params.set("langid.langsField", "languages");
+    params.set("langid.enforceSchema", "false");
+    params.set("langid.map", "true");
+
+    // Phase 1: keepOrig=false, the mapped field replaces the original
+    params.set("langid.map.keepOrig", "false");
+    liProcessor = createLangIdProcessor(params);
+
+    SolrInputDocument withoutOrig = process(englishDoc());
+    assertEquals("text_en", liProcessor.getMappedField("text", "en"));
+    assertEquals("en", withoutOrig.getFieldValue("language"));
+    assertTrue(withoutOrig.containsKey("text_en"));
+    assertFalse(withoutOrig.containsKey("text"));
+
+    // Phase 2: keepOrig=true, both the mapped and the original field are present
+    params.set("langid.map.keepOrig", "true");
+    liProcessor = createLangIdProcessor(params);
+
+    SolrInputDocument withOrig = process(englishDoc());
+    assertTrue(withOrig.containsKey("text_en"));
+    assertTrue(withOrig.containsKey("text"));
+    assertEquals(englishDoc().getFieldValue("text"), withOrig.getFieldValue("text_en"));
+
+    // Phase 3: keepOrig=true combined with per-field detection
+    params.set("langid.map.individual", "true");
+    params.set("langid.fl", "text,text2");
+    liProcessor = createLangIdProcessor(params);
+
+    SolrInputDocument perField = process(languagePerFieldDoc());
+    for (String expectedField : new String[] {"text_en", "text", "text2_ru", "text2"}) {
+      assertTrue(perField.containsKey(expectedField));
+    }
+    assertEquals(languagePerFieldDoc().getFieldValue("text"), perField.getFieldValue("text_en"));
+  }
+
+  @Test
+  public void testMapIndividual() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set("langid.fl", "text,text2");
+    params.set("langid.langField", "language");
+    params.set("langid.langsField", "languages");
+    params.set("langid.enforceSchema", "false");
+    params.set("langid.map", "true");
+    params.set("langid.map.individual", "true");
+    liProcessor = createLangIdProcessor(params);
+
+    // Each input field must be mapped using its own detected language
+    SolrInputDocument perField = process(languagePerFieldDoc());
+    for (String expectedField : new String[] {"text_en", "text2_ru"}) {
+      assertTrue(perField.containsKey(expectedField));
+    }
+  }
  
  // Various utility methods
  
@ -282,6 +336,12 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    return doc;
  }

+  private SolrInputDocument languagePerFieldDoc() {
+    // Start from englishDoc() and add a second field, "text2", holding mostly Russian text
+    SolrInputDocument perFieldDoc = englishDoc();
+    String text2Value = "The Apache Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска, написанная на Java. Может быть использована для поиска в интернете и других областях компьютерной лингвистики (аналитическая философия).";
+    perFieldDoc.addField("text2", text2Value);
+    return perFieldDoc;
+  }
+  
  /**
   * Construct document containing multi-value fields in different languages.
   * @return solr input document
@ -307,7 +367,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    if(liProcessor == null)
      throw new Exception("Processor must be initialized before calling assertLang()");
    SolrInputDocument doc = sid(fieldsAndValues);
-    assertEquals(langCode, liProcessor.process(doc).getFieldValue(liProcessor.langField));
+    assertEquals(langCode, process(doc).getFieldValue(liProcessor.langField));
  }
  
  private SolrInputDocument sid(String... fieldsAndValues) {
@ -317,4 +377,13 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    }
    return doc;
  }
+  
+  /*
+  Test helper: runs the processor on a deep copy of the given document and
+  returns that copy, so the document passed in is not modified
+   */
+  private SolrInputDocument process(SolrInputDocument origDoc) {
+    final SolrInputDocument processedCopy = origDoc.deepCopy();
+    liProcessor.process(processedCopy);
+    return processedCopy;
+  }
 }
--- a/solr/contrib/langid/src/test/org/apache/solr/update/processor/SolrInputDocumentReaderTest.java
+++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/SolrInputDocumentReaderTest.java
@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.util.Arrays;
+
+import org.apache.solr.common.SolrInputDocument;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class SolrInputDocumentReaderTest {
+  private SolrInputDocument doc;
+  private String[] allFields;
+
+  /** Builds a document with a single-valued, a multi-valued, a numeric and a longer string field. */
+  @Before
+  public void setUp() throws Exception {
+    allFields = new String[] {"f1", "f2", "f3", "f4"};
+    doc = new SolrInputDocument();
+    doc.addField("f1", "a b c");
+    doc.addField("f2", "multi");
+    doc.addField("f2", "valued");
+    doc.addField("f2", "field");
+    doc.addField("f3", 123);
+    doc.addField("f4", "12345678901234567890");
+  }
+
+  /** Reads the document in several small chunks until the total character limit is hit. */
+  @Test
+  public void readChunked() throws Exception {
+    final int maxTotalChars = 20;
+    final int maxCharsPerFieldValue = 18;
+    SolrInputDocumentReader reader =
+        new SolrInputDocumentReader(doc, allFields, maxTotalChars, maxCharsPerFieldValue, " - ");
+    assertTrue(reader.ready());
+
+    char[] buf = new char[1000];
+    int total = reader.read(buf, 0, 9);
+    assertEquals(9, total);
+    assertArrEqu("a b c - m", buf, total);
+
+    total += reader.read(buf, 9, 2);
+    assertArrEqu("a b c - mul", buf, total);
+
+    total += reader.read(buf, 11, 1);
+    assertArrEqu("a b c - mult", buf, total);
+
+    total += reader.read(buf, 12, 10);
+    // We now hit totalMaxChars
+    assertArrEqu("a b c - multi - valu", buf, total);
+  }
+
+  /** Every field value is cut off at the per-value limit. */
+  @Test
+  public void maxFieldValueLength() throws Exception {
+    final int maxTotalChars = 21;
+    final int maxCharsPerFieldValue = 2;
+    SolrInputDocumentReader reader =
+        new SolrInputDocumentReader(doc, allFields, maxTotalChars, maxCharsPerFieldValue, " - ");
+    assertTrue(reader.ready());
+
+    char[] buf = new char[1000];
+    int numRead = reader.read(buf, 0, 22);
+    assertEquals(21, numRead);
+    assertArrEqu("a  - mu - va - fi - 1", buf, numRead);
+  }
+
+  /** The three-argument constructor streams only the String fields, separated by a space. */
+  @Test
+  public void allStrFields() throws Exception {
+    SolrInputDocumentReader reader = new SolrInputDocumentReader(doc, 20000, 10000);
+    assertTrue(reader.ready());
+
+    char[] buf = new char[1000];
+    int numRead = reader.read(buf, 0, 1000);
+    assertEquals(45, numRead);
+    assertArrEqu("a b c multi valued field 12345678901234567890", buf, numRead);
+  }
+
+  /** The numeric field f3 is not reported as a String field. */
+  @Test
+  public void testGetStringFields() throws Exception {
+    String[] stringFields = SolrInputDocumentReader.getStringFields(doc);
+    assertArrayEquals(new String[] {"f1", "f2", "f4"}, stringFields);
+  }
+
+  /** Asserts that the first {@code len} chars of {@code chars} spell {@code expected}. */
+  private void assertArrEqu(String expected, char[] chars, int len) {
+    char[] prefix = Arrays.copyOf(chars, len);
+    assertEquals(expected, String.valueOf(prefix));
+  }
+
+}