SOLR-3881: Avoid OOMs in LanguageIdentifierUpdateProcessor:

- Added langid.maxFieldValueChars and langid.maxTotalChars params to limit
  input, by default 10k and 20k chars, respectively.
- Moved input concatenation to Tika implementation; the langdetect
  implementation instead appends each input piece via the langdetect API.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1643377 13f79535-47bb-0310-9956-ffa450edef68
Steven Rowe 2014-12-05 18:22:30 +00:00
parent 846e457684
commit d72e3dfb0f
6 changed files with 274 additions and 41 deletions
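
For context, the new limits are exercised the same way in the test added by this commit: they are passed as update request parameters. A minimal sketch, written as if inside that test class (so createLangIdProcessor and the imports are the test's own); the field name foo_s appears in the test, while the limit values here are illustrative:

    // Configure the per-field-value and total character limits used for language detection.
    // Mirrors the setup in TikaLanguageIdentifierUpdateProcessorFactoryTest further down.
    ModifiableSolrParams parameters = new ModifiableSolrParams();
    parameters.add("langid.fl", "foo_s");                 // field(s) to detect the language from
    parameters.add("langid.langField", "language");       // field that receives the detected language
    parameters.add("langid.maxFieldValueChars", "5000");  // use at most 5000 chars of each field value
    parameters.add("langid.maxTotalChars", "10000");      // use at most 10000 chars overall
    TikaLanguageIdentifierUpdateProcessor processor =
        (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);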

File: solr/CHANGES.txt

@@ -296,6 +296,13 @@ Bug Fixes
 * SOLR-6763: Shard leader elections should not persist across session expiry
   (Alan Woodward, Mark Miller)
 
+* SOLR-3881: Avoid OOMs in LanguageIdentifierUpdateProcessor:
+  - Added langid.maxFieldValueChars and langid.maxTotalChars params to limit
+    input, by default 10k and 20k chars, respectively.
+  - Moved input concatenation to Tika implementation; the langdetect
+    implementation instead appends each input piece via the langdetect API.
+  (Vitaliy Zhovtyuk, Tomás Fernández Löbbe, Rob Tulloh, Steve Rowe)
+
 Optimizations
 ----------------------

File: LangDetectLanguageIdentifierUpdateProcessor.java

@@ -18,6 +18,7 @@ package org.apache.solr.update.processor;
  */
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
@@ -28,6 +29,7 @@ import com.cybozu.labs.langdetect.Detector;
 import com.cybozu.labs.langdetect.DetectorFactory;
 import com.cybozu.labs.langdetect.LangDetectException;
 import com.cybozu.labs.langdetect.Language;
+import org.apache.solr.common.SolrInputDocument;
 
 /**
  * Identifies the language of a set of input fields using http://code.google.com/p/language-detection
@@ -43,15 +45,32 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
   }
 
   @Override
-  protected List<DetectedLanguage> detectLanguage(String content) {
-    if (content.trim().length() == 0) { // to be consistent with the tika impl?
-      log.debug("No input text to detect language from, returning empty list");
-      return Collections.emptyList();
-    }
+  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
     try {
       Detector detector = DetectorFactory.create();
-      detector.append(content);
+      detector.setMaxTextLength(maxTotalChars);
+      for (String fieldName : inputFields) {
+        log.debug("Appending field " + fieldName);
+        if (doc.containsKey(fieldName)) {
+          Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+          if (fieldValues != null) {
+            for (Object content : fieldValues) {
+              if (content instanceof String) {
+                String stringContent = (String) content;
+                if (stringContent.length() > maxFieldValueChars) {
+                  detector.append(stringContent.substring(0, maxFieldValueChars));
+                } else {
+                  detector.append(stringContent);
+                }
+                detector.append(" ");
+              } else {
+                log.warn("Field " + fieldName + " not a String value, not including in detection");
+              }
+            }
+          }
+        }
+      }
       ArrayList<Language> langlist = detector.getProbabilities();
       ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
       for (Language l: langlist) {

File: LangIdParams.java

@@ -41,12 +41,16 @@ public interface LangIdParams {
   String MAP_LCMAP = LANGUAGE_ID + ".map.lcmap";     // Enables mapping multiple langs to same output field
   String MAP_PATTERN = LANGUAGE_ID + ".map.pattern"; // RegEx pattern to match field name
   String MAP_REPLACE = LANGUAGE_ID + ".map.replace"; // Replace pattern
+  String MAX_FIELD_VALUE_CHARS = LANGUAGE_ID + ".maxFieldValueChars"; // Maximum number of characters to use per field for language detection
+  String MAX_TOTAL_CHARS = LANGUAGE_ID + ".maxTotalChars"; // Maximum number of characters to use across all concatenated fields for language detection
 
   String DOCID_FIELD_DEFAULT = "id";
   String DOCID_LANGFIELD_DEFAULT = null;
   String DOCID_LANGSFIELD_DEFAULT = null;
   String MAP_PATTERN_DEFAULT = "(.*)";
   String MAP_REPLACE_DEFAULT = "$1_{lang}";
+  int MAX_FIELD_VALUE_CHARS_DEFAULT = 10000;
+  int MAX_TOTAL_CHARS_DEFAULT = 20000;
 
   // TODO: This default threshold accepts even "uncertain" detections.
   // Increase &langid.threshold above 0.5 to return only certain detections

File: LanguageIdentifierUpdateProcessor.java

@@ -78,6 +78,8 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
   protected HashMap<String,String> lcMap;
   protected HashMap<String,String> mapLcMap;
   protected IndexSchema schema;
+  protected int maxFieldValueChars;
+  protected int maxTotalChars;
 
   // Regex patterns
   protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)");
@@ -169,8 +171,21 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
       mapPattern = Pattern.compile(params.get(MAP_PATTERN, MAP_PATTERN_DEFAULT));
       mapReplaceStr = params.get(MAP_REPLACE, MAP_REPLACE_DEFAULT);
+      maxFieldValueChars = params.getInt(MAX_FIELD_VALUE_CHARS, MAX_FIELD_VALUE_CHARS_DEFAULT);
+      maxTotalChars = params.getInt(MAX_TOTAL_CHARS, MAX_TOTAL_CHARS_DEFAULT);
+      if (maxFieldValueChars > maxTotalChars) {
+        if (maxTotalChars == MAX_TOTAL_CHARS_DEFAULT) {
+          // The user specified only maxFieldValueChars, so raise maxTotalChars to match it
+          log.warn(MAX_FIELD_VALUE_CHARS + " (" + maxFieldValueChars + ") is greater than " + MAX_TOTAL_CHARS + " ("
+              + maxTotalChars + "). Setting " + MAX_TOTAL_CHARS + " to " + maxFieldValueChars + ".");
+          maxTotalChars = maxFieldValueChars;
+        } else {
+          // The user specified maxTotalChars, so cap maxFieldValueChars at that value
+          log.warn(MAX_FIELD_VALUE_CHARS + " (" + maxFieldValueChars + ") is greater than " + MAX_TOTAL_CHARS + " ("
+              + maxTotalChars + "). Setting " + MAX_FIELD_VALUE_CHARS + " to " + maxTotalChars + ".");
+          maxFieldValueChars = maxTotalChars;
+        }
+      }
     }
     log.debug("LangId configured");
@@ -203,11 +218,10 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
     String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
 
     if(langField == null || !doc.containsKey(langField) || (doc.containsKey(langField) && overwrite)) {
-      String allText = concatFields(doc, inputFields);
-      List<DetectedLanguage> languagelist = detectLanguage(allText);
+      List<DetectedLanguage> languagelist = detectLanguage(doc);
       docLang = resolveLanguage(languagelist, fallbackLang);
       docLangs.add(docLang);
-      log.debug("Detected main document language from fields "+inputFields+": "+docLang);
+      log.debug("Detected main document language from fields "+ Arrays.toString(inputFields) +": "+docLang);
 
       if(doc.containsKey(langField) && overwrite) {
         log.debug("Overwritten old value "+doc.getFieldValue(langField));
@@ -227,8 +241,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
       if(doc.containsKey(fieldName)) {
         String fieldLang;
         if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
-          String text = (String) doc.getFieldValue(fieldName);
-          List<DetectedLanguage> languagelist = detectLanguage(text);
+          List<DetectedLanguage> languagelist = detectLanguage(doc);
           fieldLang = resolveLanguage(languagelist, docLang);
           docLangs.add(fieldLang);
           log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
@@ -284,37 +297,13 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
     return lang;
   }
 
-  /*
-   * Concatenates content from multiple fields
-   */
-  protected String concatFields(SolrInputDocument doc, String[] fields) {
-    StringBuilder sb = new StringBuilder();
-    for (String fieldName : inputFields) {
-      log.debug("Appending field "+fieldName);
-      if (doc.containsKey(fieldName)) {
-        Collection<Object> fieldValues = doc.getFieldValues(fieldName);
-        if (fieldValues != null) {
-          for (Object content : fieldValues) {
-            if (content instanceof String) {
-              sb.append((String) content);
-              sb.append(" ");
-            } else {
-              log.warn("Field " + fieldName + " not a String value, not including in detection");
-            }
-          }
-        }
-      }
-    }
-    return sb.toString();
-  }
-
   /**
    * Detects language(s) from a string.
    * Classes wishing to implement their own language detection module should override this method.
    * @param content The content to identify
    * @return List of detected language(s) according to RFC-3066
    */
-  protected abstract List<DetectedLanguage> detectLanguage(String content);
+  protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content);
 
   /**
    * Chooses a language based on the list of candidates detected

File: TikaLanguageIdentifierUpdateProcessor.java

@@ -24,6 +24,9 @@ import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.tika.language.LanguageIdentifier;
+import org.apache.solr.common.SolrInputDocument;
+
+import java.util.Collection;
 
 /**
  * Identifies the language of a set of input fields using Tika's
 * LanguageIdentifier.
@@ -40,9 +43,10 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd
   }
 
   @Override
-  protected List<DetectedLanguage> detectLanguage(String content) {
+  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
     List<DetectedLanguage> languages = new ArrayList<>();
-    if(content.trim().length() != 0) {
+    String content = concatFields(doc);
+    if (content.length() != 0) {
       LanguageIdentifier identifier = new LanguageIdentifier(content);
       // FIXME: Hack - we get the distance from toString and calculate our own certainty score
       Double distance = Double.parseDouble(tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1"));
@@ -57,4 +61,59 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd
     }
     return languages;
   }
+
+  /**
+   * Concatenates content from multiple fields
+   */
+  protected String concatFields(SolrInputDocument doc) {
+    StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
+    for (String fieldName : inputFields) {
+      log.debug("Appending field " + fieldName);
+      if (doc.containsKey(fieldName)) {
+        Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+        if (fieldValues != null) {
+          for (Object content : fieldValues) {
+            if (content instanceof String) {
+              String stringContent = (String) content;
+              if (stringContent.length() > maxFieldValueChars) {
+                sb.append(stringContent.substring(0, maxFieldValueChars));
+              } else {
+                sb.append(stringContent);
+              }
+              sb.append(" ");
+              if (sb.length() > maxTotalChars) {
+                sb.setLength(maxTotalChars);
+                break;
+              }
+            } else {
+              log.warn("Field " + fieldName + " not a String value, not including in detection");
+            }
+          }
+        }
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Calculate expected string size.
+   *
+   * @param doc solr input document
+   * @param fields fields to select
+   * @return expected size of string value
+   */
+  private int getExpectedSize(SolrInputDocument doc, String[] fields) {
+    int docSize = 0;
+    for (String field : fields) {
+      Collection<Object> contents = doc.getFieldValues(field);
+      for (Object content : contents) {
+        if (content instanceof String) {
+          docSize += Math.min(((String) content).length(), maxFieldValueChars);
+        }
+      }
+      docSize = Math.min(docSize, maxTotalChars);
+    }
+    return docSize;
+  }
 }

File: TikaLanguageIdentifierUpdateProcessorFactoryTest.java

@@ -17,11 +17,166 @@ package org.apache.solr.update.processor;
  * limitations under the License.
  */
 
+import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.ModifiableSolrParams;
+import org.junit.Test;
 
 public class TikaLanguageIdentifierUpdateProcessorFactoryTest extends LanguageIdentifierUpdateProcessorFactoryTestCase {
 
   @Override
   protected LanguageIdentifierUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) throws Exception {
     return new TikaLanguageIdentifierUpdateProcessor(_parser.buildRequestFrom(h.getCore(), parameters, null), resp, null);
   }
+
+  @Test
+  public void testMaxFieldValueChars() throws Exception {
+    SolrInputDocument doc = new SolrInputDocument();
+    String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+    String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+    doc.addField("foo_s", valueF1);
+
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+
+    doc.addField("bar_s", valueF2);
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache" + " " + "An ope", p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "100000");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+  }
+
+  @Test
+  public void testMaxTotalChars() throws Exception {
+    SolrInputDocument doc = new SolrInputDocument();
+    String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+    String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+    doc.addField("foo_s", valueF1);
+
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxTotalChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+
+    doc.addField("bar_s", valueF2);
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxTotalChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxTotalChars", "100000");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+  }
+
+  @Test
+  public void testMaxFieldValueCharsAndMaxTotalChars() throws Exception {
+    SolrInputDocument doc = new SolrInputDocument();
+    String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+    String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+    doc.addField("foo_s", valueF1);
+
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "8");
+    parameters.add("langid.maxTotalChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+
+    doc.addField("bar_s", valueF2);
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "3");
+    parameters.add("langid.maxTotalChars", "8");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apa An", p.concatFields(doc).trim());
+
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "10000");
+    parameters.add("langid.maxTotalChars", "100000");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+  }
 }