SOLR-3881: Avoid OOMs in LanguageIdentifierUpdateProcessor:

- Added langid.maxFieldValueChars and langid.maxTotalChars params to limit
  input, by default 10k and 20k chars, respectively.
- Moved input concatenation to Tika implementation; the langdetect
  implementation instead appends each input piece via the langdetect API.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1643377 13f79535-47bb-0310-9956-ffa450edef68
Steven Rowe 2014-12-05 18:22:30 +00:00
parent 846e457684
commit d72e3dfb0f
6 changed files with 274 additions and 41 deletions
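
For context, the new limits are exercised the same way in the test added by this commit: they are passed as update request parameters. A minimal sketch, written as if inside that test class (so createLangIdProcessor and the imports are the test's own); the field name foo_s appears in the test, while the limit values here are illustrative:

    // Configure the per-field-value and total character limits used for language detection.
    // Mirrors the setup in TikaLanguageIdentifierUpdateProcessorFactoryTest further down.
    ModifiableSolrParams parameters = new ModifiableSolrParams();
    parameters.add("langid.fl", "foo_s");                 // field(s) to detect the language from
    parameters.add("langid.langField", "language");       // field that receives the detected language
    parameters.add("langid.maxFieldValueChars", "5000");  // use at most 5000 chars of each field value
    parameters.add("langid.maxTotalChars", "10000");      // use at most 10000 chars overall
    TikaLanguageIdentifierUpdateProcessor processor =
        (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);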

File: solr/CHANGES.txt

@@ -296,6 +296,13 @@ Bug Fixes
 * SOLR-6763: Shard leader elections should not persist across session expiry
   (Alan Woodward, Mark Miller)
 
+* SOLR-3881: Avoid OOMs in LanguageIdentifierUpdateProcessor:
+  - Added langid.maxFieldValueChars and langid.maxTotalChars params to limit
+    input, by default 10k and 20k chars, respectively.
+  - Moved input concatenation to Tika implementation; the langdetect
+    implementation instead appends each input piece via the langdetect API.
+  (Vitaliy Zhovtyuk, Tomás Fernández Löbbe, Rob Tulloh, Steve Rowe)
+
 Optimizations
 ----------------------

File: LangDetectLanguageIdentifierUpdateProcessor.java

@@ -18,6 +18,7 @@ package org.apache.solr.update.processor;
  */
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
@@ -28,6 +29,7 @@ import com.cybozu.labs.langdetect.Detector;
 import com.cybozu.labs.langdetect.DetectorFactory;
 import com.cybozu.labs.langdetect.LangDetectException;
 import com.cybozu.labs.langdetect.Language;
+import org.apache.solr.common.SolrInputDocument;
 
 /**
  * Identifies the language of a set of input fields using http://code.google.com/p/language-detection
@@ -43,15 +45,32 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
   }
 
   @Override
-  protected List<DetectedLanguage> detectLanguage(String content) {
-    if (content.trim().length() == 0) { // to be consistent with the tika impl?
-      log.debug("No input text to detect language from, returning empty list");
-      return Collections.emptyList();
-    }
+  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
     try {
       Detector detector = DetectorFactory.create();
-      detector.append(content);
+      detector.setMaxTextLength(maxTotalChars);
+      for (String fieldName : inputFields) {
+        log.debug("Appending field " + fieldName);
+        if (doc.containsKey(fieldName)) {
+          Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+          if (fieldValues != null) {
+            for (Object content : fieldValues) {
+              if (content instanceof String) {
+                String stringContent = (String) content;
+                if (stringContent.length() > maxFieldValueChars) {
+                  detector.append(stringContent.substring(0, maxFieldValueChars));
+                } else {
+                  detector.append(stringContent);
+                }
+                detector.append(" ");
+              } else {
+                log.warn("Field " + fieldName + " not a String value, not including in detection");
+              }
+            }
+          }
+        }
+      }
       ArrayList<Language> langlist = detector.getProbabilities();
       ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
       for (Language l: langlist) {

File: LangIdParams.java

@@ -41,12 +41,16 @@ public interface LangIdParams {
   String MAP_LCMAP = LANGUAGE_ID + ".map.lcmap";     // Enables mapping multiple langs to same output field
   String MAP_PATTERN = LANGUAGE_ID + ".map.pattern"; // RegEx pattern to match field name
   String MAP_REPLACE = LANGUAGE_ID + ".map.replace"; // Replace pattern
+  String MAX_FIELD_VALUE_CHARS = LANGUAGE_ID + ".maxFieldValueChars"; // Maximum number of characters to use per field for language detection
+  String MAX_TOTAL_CHARS = LANGUAGE_ID + ".maxTotalChars"; // Maximum number of characters to use across all concatenated fields for language detection
 
   String DOCID_FIELD_DEFAULT = "id";
   String DOCID_LANGFIELD_DEFAULT = null;
   String DOCID_LANGSFIELD_DEFAULT = null;
   String MAP_PATTERN_DEFAULT = "(.*)";
   String MAP_REPLACE_DEFAULT = "$1_{lang}";
+  int MAX_FIELD_VALUE_CHARS_DEFAULT = 10000;
+  int MAX_TOTAL_CHARS_DEFAULT = 20000;
 
   // TODO: This default threshold accepts even "uncertain" detections.
   // Increase &langid.threshold above 0.5 to return only certain detections

File: LanguageIdentifierUpdateProcessor.java

@@ -78,6 +78,8 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
   protected HashMap<String,String> lcMap;
   protected HashMap<String,String> mapLcMap;
   protected IndexSchema schema;
+  protected int maxFieldValueChars;
+  protected int maxTotalChars;
 
   // Regex patterns
   protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)");
@@ -169,8 +171,21 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
       mapPattern = Pattern.compile(params.get(MAP_PATTERN, MAP_PATTERN_DEFAULT));
       mapReplaceStr = params.get(MAP_REPLACE, MAP_REPLACE_DEFAULT);
+      maxFieldValueChars = params.getInt(MAX_FIELD_VALUE_CHARS, MAX_FIELD_VALUE_CHARS_DEFAULT);
+      maxTotalChars = params.getInt(MAX_TOTAL_CHARS, MAX_TOTAL_CHARS_DEFAULT);
+      if (maxFieldValueChars > maxTotalChars) {
+        if (maxTotalChars == MAX_TOTAL_CHARS_DEFAULT) {
+          // The user specified only maxFieldValueChars, so raise maxTotalChars to match it
+          log.warn(MAX_FIELD_VALUE_CHARS + " (" + maxFieldValueChars + ") is greater than " + MAX_TOTAL_CHARS + " ("
+              + maxTotalChars + "). Setting " + MAX_TOTAL_CHARS + " to " + maxFieldValueChars + ".");
+          maxTotalChars = maxFieldValueChars;
+        } else {
+          // The user specified maxTotalChars, so cap maxFieldValueChars at that value
+          log.warn(MAX_FIELD_VALUE_CHARS + " (" + maxFieldValueChars + ") is greater than " + MAX_TOTAL_CHARS + " ("
+              + maxTotalChars + "). Setting " + MAX_FIELD_VALUE_CHARS + " to " + maxTotalChars + ".");
+          maxFieldValueChars = maxTotalChars;
+        }
+      }
     }
     log.debug("LangId configured");
@@ -203,11 +218,10 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
     String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
 
     if(langField == null || !doc.containsKey(langField) || (doc.containsKey(langField) && overwrite)) {
-      String allText = concatFields(doc, inputFields);
-      List<DetectedLanguage> languagelist = detectLanguage(allText);
+      List<DetectedLanguage> languagelist = detectLanguage(doc);
       docLang = resolveLanguage(languagelist, fallbackLang);
       docLangs.add(docLang);
-      log.debug("Detected main document language from fields "+inputFields+": "+docLang);
+      log.debug("Detected main document language from fields "+ Arrays.toString(inputFields) +": "+docLang);
 
       if(doc.containsKey(langField) && overwrite) {
         log.debug("Overwritten old value "+doc.getFieldValue(langField));
@@ -227,8 +241,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
       if(doc.containsKey(fieldName)) {
         String fieldLang;
         if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
-          String text = (String) doc.getFieldValue(fieldName);
-          List<DetectedLanguage> languagelist = detectLanguage(text);
+          List<DetectedLanguage> languagelist = detectLanguage(doc);
           fieldLang = resolveLanguage(languagelist, docLang);
           docLangs.add(fieldLang);
           log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
@@ -284,37 +297,13 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
     return lang;
   }
 
-  /*
-   * Concatenates content from multiple fields
-   */
-  protected String concatFields(SolrInputDocument doc, String[] fields) {
-    StringBuilder sb = new StringBuilder();
-    for (String fieldName : inputFields) {
-      log.debug("Appending field "+fieldName);
-      if (doc.containsKey(fieldName)) {
-        Collection<Object> fieldValues = doc.getFieldValues(fieldName);
-        if (fieldValues != null) {
-          for (Object content : fieldValues) {
-            if (content instanceof String) {
-              sb.append((String) content);
-              sb.append(" ");
-            } else {
-              log.warn("Field " + fieldName + " not a String value, not including in detection");
-            }
-          }
-        }
-      }
-    }
-    return sb.toString();
-  }
-
   /**
    * Detects language(s) from a string.
    * Classes wishing to implement their own language detection module should override this method.
    * @param content The content to identify
    * @return List of detected language(s) according to RFC-3066
    */
-  protected abstract List<DetectedLanguage> detectLanguage(String content);
+  protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content);
 
   /**
    * Chooses a language based on the list of candidates detected

File: TikaLanguageIdentifierUpdateProcessor.java

@@ -24,6 +24,9 @@ import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.tika.language.LanguageIdentifier;
+import org.apache.solr.common.SolrInputDocument;
+
+import java.util.Collection;
 
 /**
  * Identifies the language of a set of input fields using Tika's
 * LanguageIdentifier.
@@ -40,9 +43,10 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd
   }
 
   @Override
-  protected List<DetectedLanguage> detectLanguage(String content) {
+  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
     List<DetectedLanguage> languages = new ArrayList<>();
-    if(content.trim().length() != 0) {
+    String content = concatFields(doc);
+    if (content.length() != 0) {
       LanguageIdentifier identifier = new LanguageIdentifier(content);
       // FIXME: Hack - we get the distance from toString and calculate our own certainty score
       Double distance = Double.parseDouble(tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1"));
@@ -57,4 +61,59 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd
     }
     return languages;
   }
+
+  /**
+   * Concatenates content from multiple fields
+   */
+  protected String concatFields(SolrInputDocument doc) {
+    StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
+    for (String fieldName : inputFields) {
+      log.debug("Appending field " + fieldName);
+      if (doc.containsKey(fieldName)) {
+        Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+        if (fieldValues != null) {
+          for (Object content : fieldValues) {
+            if (content instanceof String) {
+              String stringContent = (String) content;
+              if (stringContent.length() > maxFieldValueChars) {
+                sb.append(stringContent.substring(0, maxFieldValueChars));
+              } else {
+                sb.append(stringContent);
+              }
+              sb.append(" ");
+              if (sb.length() > maxTotalChars) {
+                sb.setLength(maxTotalChars);
+                break;
+              }
+            } else {
+              log.warn("Field " + fieldName + " not a String value, not including in detection");
+            }
+          }
+        }
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Calculate expected string size.
+   *
+   * @param doc solr input document
+   * @param fields fields to select
+   * @return expected size of string value
+   */
+  private int getExpectedSize(SolrInputDocument doc, String[] fields) {
+    int docSize = 0;
+    for (String field : fields) {
+      Collection<Object> contents = doc.getFieldValues(field);
+      for (Object content : contents) {
+        if (content instanceof String) {
+          docSize += Math.min(((String) content).length(), maxFieldValueChars);
+        }
+      }
+      docSize = Math.min(docSize, maxTotalChars);
+    }
+    return docSize;
+  }
 }

File: TikaLanguageIdentifierUpdateProcessorFactoryTest.java

@@ -17,11 +17,166 @@ package org.apache.solr.update.processor;
  * limitations under the License.
  */
 
+import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.ModifiableSolrParams;
+import org.junit.Test;
 
 public class TikaLanguageIdentifierUpdateProcessorFactoryTest extends LanguageIdentifierUpdateProcessorFactoryTestCase {
 
   @Override
   protected LanguageIdentifierUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) throws Exception {
     return new TikaLanguageIdentifierUpdateProcessor(_parser.buildRequestFrom(h.getCore(), parameters, null), resp, null);
   }
+
+  @Test
+  public void testMaxFieldValueChars() throws Exception {
+    SolrInputDocument doc = new SolrInputDocument();
+    String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+    String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+    doc.addField("foo_s", valueF1);
+
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+
+    doc.addField("bar_s", valueF2);
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache" + " " + "An ope", p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "100000");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+  }
+
+  @Test
+  public void testMaxTotalChars() throws Exception {
+    SolrInputDocument doc = new SolrInputDocument();
+    String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+    String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+    doc.addField("foo_s", valueF1);
+
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxTotalChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+
+    doc.addField("bar_s", valueF2);
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxTotalChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxTotalChars", "100000");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+  }
+
+  @Test
+  public void testMaxFieldValueCharsAndMaxTotalChars() throws Exception {
+    SolrInputDocument doc = new SolrInputDocument();
+    String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+    String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+    doc.addField("foo_s", valueF1);
+
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "8");
+    parameters.add("langid.maxTotalChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+
+    doc.addField("bar_s", valueF2);
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "3");
+    parameters.add("langid.maxTotalChars", "8");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apa An", p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "10000");
+    parameters.add("langid.maxTotalChars", "100000");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+  }
 }