mirror of https://github.com/apache/lucene.git
SOLR-3881: Avoid OOMs in LanguageIdentifierUpdateProcessor:
- Added langid.maxFieldValueChars and langid.maxTotalChars params to limit input, by default 10k and 20k chars, respectively. - Moved input concatenation to Tika implementation; the langdetect implementation instead appends each input piece via the langdetect API. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1643377 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
846e457684
commit
d72e3dfb0f
|
@ -296,6 +296,13 @@ Bug Fixes
|
||||||
|
|
||||||
* SOLR-6763: Shard leader elections should not persist across session expiry
|
* SOLR-6763: Shard leader elections should not persist across session expiry
|
||||||
(Alan Woodward, Mark Miller)
|
(Alan Woodward, Mark Miller)
|
||||||
|
|
||||||
|
* SOLR-3881: Avoid OOMs in LanguageIdentifierUpdateProcessor:
|
||||||
|
- Added langid.maxFieldValueChars and langid.maxTotalChars params to limit
|
||||||
|
input, by default 10k and 20k chars, respectively.
|
||||||
|
- Moved input concatenation to Tika implementation; the langdetect
|
||||||
|
implementation instead appends each input piece via the langdetect API.
|
||||||
|
(Vitaliy Zhovtyuk, Tomás Fernández Löbbe, Rob Tulloh, Steve Rowe)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.solr.update.processor;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -28,6 +29,7 @@ import com.cybozu.labs.langdetect.Detector;
|
||||||
import com.cybozu.labs.langdetect.DetectorFactory;
|
import com.cybozu.labs.langdetect.DetectorFactory;
|
||||||
import com.cybozu.labs.langdetect.LangDetectException;
|
import com.cybozu.labs.langdetect.LangDetectException;
|
||||||
import com.cybozu.labs.langdetect.Language;
|
import com.cybozu.labs.langdetect.Language;
|
||||||
|
import org.apache.solr.common.SolrInputDocument;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Identifies the language of a set of input fields using http://code.google.com/p/language-detection
|
* Identifies the language of a set of input fields using http://code.google.com/p/language-detection
|
||||||
|
@ -43,15 +45,32 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<DetectedLanguage> detectLanguage(String content) {
|
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
|
||||||
if (content.trim().length() == 0) { // to be consistent with the tika impl?
|
|
||||||
log.debug("No input text to detect language from, returning empty list");
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
Detector detector = DetectorFactory.create();
|
Detector detector = DetectorFactory.create();
|
||||||
detector.append(content);
|
detector.setMaxTextLength(maxTotalChars);
|
||||||
|
|
||||||
|
for (String fieldName : inputFields) {
|
||||||
|
log.debug("Appending field " + fieldName);
|
||||||
|
if (doc.containsKey(fieldName)) {
|
||||||
|
Collection<Object> fieldValues = doc.getFieldValues(fieldName);
|
||||||
|
if (fieldValues != null) {
|
||||||
|
for (Object content : fieldValues) {
|
||||||
|
if (content instanceof String) {
|
||||||
|
String stringContent = (String) content;
|
||||||
|
if (stringContent.length() > maxFieldValueChars) {
|
||||||
|
detector.append(stringContent.substring(0, maxFieldValueChars));
|
||||||
|
} else {
|
||||||
|
detector.append(stringContent);
|
||||||
|
}
|
||||||
|
detector.append(" ");
|
||||||
|
} else {
|
||||||
|
log.warn("Field " + fieldName + " not a String value, not including in detection");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
ArrayList<Language> langlist = detector.getProbabilities();
|
ArrayList<Language> langlist = detector.getProbabilities();
|
||||||
ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
|
ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
|
||||||
for (Language l: langlist) {
|
for (Language l: langlist) {
|
||||||
|
|
|
@ -41,12 +41,16 @@ public interface LangIdParams {
|
||||||
String MAP_LCMAP = LANGUAGE_ID + ".map.lcmap"; // Enables mapping multiple langs to same output field
|
String MAP_LCMAP = LANGUAGE_ID + ".map.lcmap"; // Enables mapping multiple langs to same output field
|
||||||
String MAP_PATTERN = LANGUAGE_ID + ".map.pattern"; // RegEx pattern to match field name
|
String MAP_PATTERN = LANGUAGE_ID + ".map.pattern"; // RegEx pattern to match field name
|
||||||
String MAP_REPLACE = LANGUAGE_ID + ".map.replace"; // Replace pattern
|
String MAP_REPLACE = LANGUAGE_ID + ".map.replace"; // Replace pattern
|
||||||
|
String MAX_FIELD_VALUE_CHARS = LANGUAGE_ID + ".maxFieldValueChars"; // Maximum number of characters to use per field for language detection
|
||||||
|
String MAX_TOTAL_CHARS = LANGUAGE_ID + ".maxTotalChars"; // Maximum number of characters to use per all concatenated fields for language detection
|
||||||
|
|
||||||
String DOCID_FIELD_DEFAULT = "id";
|
String DOCID_FIELD_DEFAULT = "id";
|
||||||
String DOCID_LANGFIELD_DEFAULT = null;
|
String DOCID_LANGFIELD_DEFAULT = null;
|
||||||
String DOCID_LANGSFIELD_DEFAULT = null;
|
String DOCID_LANGSFIELD_DEFAULT = null;
|
||||||
String MAP_PATTERN_DEFAULT = "(.*)";
|
String MAP_PATTERN_DEFAULT = "(.*)";
|
||||||
String MAP_REPLACE_DEFAULT = "$1_{lang}";
|
String MAP_REPLACE_DEFAULT = "$1_{lang}";
|
||||||
|
int MAX_FIELD_VALUE_CHARS_DEFAULT = 10000;
|
||||||
|
int MAX_TOTAL_CHARS_DEFAULT = 20000;
|
||||||
|
|
||||||
// TODO: This default threshold accepts even "uncertain" detections.
|
// TODO: This default threshold accepts even "uncertain" detections.
|
||||||
// Increase &langid.threshold above 0.5 to return only certain detections
|
// Increase &langid.threshold above 0.5 to return only certain detections
|
||||||
|
|
|
@ -78,6 +78,8 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
||||||
protected HashMap<String,String> lcMap;
|
protected HashMap<String,String> lcMap;
|
||||||
protected HashMap<String,String> mapLcMap;
|
protected HashMap<String,String> mapLcMap;
|
||||||
protected IndexSchema schema;
|
protected IndexSchema schema;
|
||||||
|
protected int maxFieldValueChars;
|
||||||
|
protected int maxTotalChars;
|
||||||
|
|
||||||
// Regex patterns
|
// Regex patterns
|
||||||
protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)");
|
protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)");
|
||||||
|
@ -169,8 +171,21 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
||||||
|
|
||||||
mapPattern = Pattern.compile(params.get(MAP_PATTERN, MAP_PATTERN_DEFAULT));
|
mapPattern = Pattern.compile(params.get(MAP_PATTERN, MAP_PATTERN_DEFAULT));
|
||||||
mapReplaceStr = params.get(MAP_REPLACE, MAP_REPLACE_DEFAULT);
|
mapReplaceStr = params.get(MAP_REPLACE, MAP_REPLACE_DEFAULT);
|
||||||
|
maxFieldValueChars = params.getInt(MAX_FIELD_VALUE_CHARS, MAX_FIELD_VALUE_CHARS_DEFAULT);
|
||||||
|
maxTotalChars = params.getInt(MAX_TOTAL_CHARS, MAX_TOTAL_CHARS_DEFAULT);
|
||||||
|
if (maxFieldValueChars > maxTotalChars) {
|
||||||
|
if (maxTotalChars == MAX_TOTAL_CHARS_DEFAULT) {
|
||||||
|
// If the user specified only maxFieldValueChars, make maxTotalChars the same as it
|
||||||
|
log.warn(MAX_FIELD_VALUE_CHARS + " (" + maxFieldValueChars + ") is less than " + MAX_TOTAL_CHARS + " ("
|
||||||
|
+ maxTotalChars + "). Setting " + MAX_TOTAL_CHARS + " to " + maxFieldValueChars + ".");
|
||||||
|
maxTotalChars = maxFieldValueChars;
|
||||||
|
} else {
|
||||||
|
// If the user specified maxTotalChars, make maxFieldValueChars the same as it
|
||||||
|
log.warn(MAX_FIELD_VALUE_CHARS + " (" + maxFieldValueChars + ") is less than " + MAX_TOTAL_CHARS + " ("
|
||||||
|
+ maxTotalChars + "). Setting " + MAX_FIELD_VALUE_CHARS + " to " + maxTotalChars + ".");
|
||||||
|
maxFieldValueChars = maxTotalChars;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
log.debug("LangId configured");
|
log.debug("LangId configured");
|
||||||
|
|
||||||
|
@ -203,11 +218,10 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
||||||
String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
|
String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
|
||||||
|
|
||||||
if(langField == null || !doc.containsKey(langField) || (doc.containsKey(langField) && overwrite)) {
|
if(langField == null || !doc.containsKey(langField) || (doc.containsKey(langField) && overwrite)) {
|
||||||
String allText = concatFields(doc, inputFields);
|
List<DetectedLanguage> languagelist = detectLanguage(doc);
|
||||||
List<DetectedLanguage> languagelist = detectLanguage(allText);
|
|
||||||
docLang = resolveLanguage(languagelist, fallbackLang);
|
docLang = resolveLanguage(languagelist, fallbackLang);
|
||||||
docLangs.add(docLang);
|
docLangs.add(docLang);
|
||||||
log.debug("Detected main document language from fields "+inputFields+": "+docLang);
|
log.debug("Detected main document language from fields "+ Arrays.toString(inputFields) +": "+docLang);
|
||||||
|
|
||||||
if(doc.containsKey(langField) && overwrite) {
|
if(doc.containsKey(langField) && overwrite) {
|
||||||
log.debug("Overwritten old value "+doc.getFieldValue(langField));
|
log.debug("Overwritten old value "+doc.getFieldValue(langField));
|
||||||
|
@ -227,8 +241,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
||||||
if(doc.containsKey(fieldName)) {
|
if(doc.containsKey(fieldName)) {
|
||||||
String fieldLang;
|
String fieldLang;
|
||||||
if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
|
if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
|
||||||
String text = (String) doc.getFieldValue(fieldName);
|
List<DetectedLanguage> languagelist = detectLanguage(doc);
|
||||||
List<DetectedLanguage> languagelist = detectLanguage(text);
|
|
||||||
fieldLang = resolveLanguage(languagelist, docLang);
|
fieldLang = resolveLanguage(languagelist, docLang);
|
||||||
docLangs.add(fieldLang);
|
docLangs.add(fieldLang);
|
||||||
log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
|
log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
|
||||||
|
@ -284,37 +297,13 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
||||||
return lang;
|
return lang;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Concatenates content from multiple fields
|
|
||||||
*/
|
|
||||||
protected String concatFields(SolrInputDocument doc, String[] fields) {
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
for (String fieldName : inputFields) {
|
|
||||||
log.debug("Appending field "+fieldName);
|
|
||||||
if (doc.containsKey(fieldName)) {
|
|
||||||
Collection<Object> fieldValues = doc.getFieldValues(fieldName);
|
|
||||||
if (fieldValues != null) {
|
|
||||||
for (Object content : fieldValues) {
|
|
||||||
if (content instanceof String) {
|
|
||||||
sb.append((String) content);
|
|
||||||
sb.append(" ");
|
|
||||||
} else {
|
|
||||||
log.warn("Field " + fieldName + " not a String value, not including in detection");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Detects language(s) from a string.
|
* Detects language(s) from a string.
|
||||||
* Classes wishing to implement their own language detection module should override this method.
|
* Classes wishing to implement their own language detection module should override this method.
|
||||||
* @param content The content to identify
|
* @param content The content to identify
|
||||||
* @return List of detected language(s) according to RFC-3066
|
* @return List of detected language(s) according to RFC-3066
|
||||||
*/
|
*/
|
||||||
protected abstract List<DetectedLanguage> detectLanguage(String content);
|
protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Chooses a language based on the list of candidates detected
|
* Chooses a language based on the list of candidates detected
|
||||||
|
|
|
@ -24,6 +24,9 @@ import org.apache.solr.request.SolrQueryRequest;
|
||||||
import org.apache.solr.response.SolrQueryResponse;
|
import org.apache.solr.response.SolrQueryResponse;
|
||||||
import org.apache.tika.language.LanguageIdentifier;
|
import org.apache.tika.language.LanguageIdentifier;
|
||||||
|
|
||||||
|
import org.apache.solr.common.SolrInputDocument;
|
||||||
|
import java.util.Collection;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Identifies the language of a set of input fields using Tika's
|
* Identifies the language of a set of input fields using Tika's
|
||||||
* LanguageIdentifier.
|
* LanguageIdentifier.
|
||||||
|
@ -40,9 +43,10 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<DetectedLanguage> detectLanguage(String content) {
|
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
|
||||||
List<DetectedLanguage> languages = new ArrayList<>();
|
List<DetectedLanguage> languages = new ArrayList<>();
|
||||||
if(content.trim().length() != 0) {
|
String content = concatFields(doc);
|
||||||
|
if (content.length() != 0) {
|
||||||
LanguageIdentifier identifier = new LanguageIdentifier(content);
|
LanguageIdentifier identifier = new LanguageIdentifier(content);
|
||||||
// FIXME: Hack - we get the distance from toString and calculate our own certainty score
|
// FIXME: Hack - we get the distance from toString and calculate our own certainty score
|
||||||
Double distance = Double.parseDouble(tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1"));
|
Double distance = Double.parseDouble(tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1"));
|
||||||
|
@ -57,4 +61,59 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd
|
||||||
}
|
}
|
||||||
return languages;
|
return languages;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Concatenates content from multiple fields
|
||||||
|
*/
|
||||||
|
protected String concatFields(SolrInputDocument doc) {
|
||||||
|
StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
|
||||||
|
for (String fieldName : inputFields) {
|
||||||
|
log.debug("Appending field " + fieldName);
|
||||||
|
if (doc.containsKey(fieldName)) {
|
||||||
|
Collection<Object> fieldValues = doc.getFieldValues(fieldName);
|
||||||
|
if (fieldValues != null) {
|
||||||
|
for (Object content : fieldValues) {
|
||||||
|
if (content instanceof String) {
|
||||||
|
String stringContent = (String) content;
|
||||||
|
if (stringContent.length() > maxFieldValueChars) {
|
||||||
|
sb.append(stringContent.substring(0, maxFieldValueChars));
|
||||||
|
} else {
|
||||||
|
sb.append(stringContent);
|
||||||
|
}
|
||||||
|
sb.append(" ");
|
||||||
|
if (sb.length() > maxTotalChars) {
|
||||||
|
sb.setLength(maxTotalChars);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
log.warn("Field " + fieldName + " not a String value, not including in detection");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate expected string size.
|
||||||
|
*
|
||||||
|
* @param doc solr input document
|
||||||
|
* @param fields fields to select
|
||||||
|
* @return expected size of string value
|
||||||
|
*/
|
||||||
|
private int getExpectedSize(SolrInputDocument doc, String[] fields) {
|
||||||
|
int docSize = 0;
|
||||||
|
for (String field : fields) {
|
||||||
|
Collection<Object> contents = doc.getFieldValues(field);
|
||||||
|
for (Object content : contents) {
|
||||||
|
if (content instanceof String) {
|
||||||
|
docSize += Math.min(((String) content).length(), maxFieldValueChars);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
docSize = Math.min(docSize, maxTotalChars);
|
||||||
|
}
|
||||||
|
return docSize;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,11 +17,166 @@ package org.apache.solr.update.processor;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.solr.common.SolrInputDocument;
|
||||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
public class TikaLanguageIdentifierUpdateProcessorFactoryTest extends LanguageIdentifierUpdateProcessorFactoryTestCase {
|
public class TikaLanguageIdentifierUpdateProcessorFactoryTest extends LanguageIdentifierUpdateProcessorFactoryTestCase {
|
||||||
@Override
|
@Override
|
||||||
protected LanguageIdentifierUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) throws Exception {
|
protected LanguageIdentifierUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) throws Exception {
|
||||||
return new TikaLanguageIdentifierUpdateProcessor(_parser.buildRequestFrom(h.getCore(), parameters, null), resp, null);
|
return new TikaLanguageIdentifierUpdateProcessor(_parser.buildRequestFrom(h.getCore(), parameters, null), resp, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMaxFieldValueChars() throws Exception {
|
||||||
|
SolrInputDocument doc = new SolrInputDocument();
|
||||||
|
String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
|
||||||
|
String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
|
||||||
|
doc.addField("foo_s", valueF1);
|
||||||
|
|
||||||
|
ModifiableSolrParams parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals(valueF1, p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
parameters.add("langid.maxFieldValueChars", "6");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals("Apache", p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
doc.addField("bar_s", valueF2);
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s,bar_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s,bar_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
parameters.add("langid.maxFieldValueChars", "6");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals("Apache" + " " + "An ope", p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s,bar_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
parameters.add("langid.maxFieldValueChars", "100000");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMaxTotalChars() throws Exception {
|
||||||
|
SolrInputDocument doc = new SolrInputDocument();
|
||||||
|
String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
|
||||||
|
String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
|
||||||
|
doc.addField("foo_s", valueF1);
|
||||||
|
|
||||||
|
ModifiableSolrParams parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals(valueF1, p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
parameters.add("langid.maxTotalChars", "6");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals("Apache", p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
doc.addField("bar_s", valueF2);
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s,bar_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s,bar_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
parameters.add("langid.maxTotalChars", "6");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals("Apache", p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s,bar_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
parameters.add("langid.maxTotalChars", "100000");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMaxFieldValueCharsAndMaxTotalChars() throws Exception {
|
||||||
|
SolrInputDocument doc = new SolrInputDocument();
|
||||||
|
String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
|
||||||
|
String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
|
||||||
|
doc.addField("foo_s", valueF1);
|
||||||
|
|
||||||
|
ModifiableSolrParams parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals(valueF1, p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
parameters.add("langid.maxFieldValueChars", "8");
|
||||||
|
parameters.add("langid.maxTotalChars", "6");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals("Apache", p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
doc.addField("bar_s", valueF2);
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s,bar_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s,bar_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
parameters.add("langid.maxFieldValueChars", "3");
|
||||||
|
parameters.add("langid.maxTotalChars", "8");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals("Apa An", p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "foo_s,bar_s");
|
||||||
|
parameters.add("langid.langField", "language");
|
||||||
|
parameters.add("langid.enforceSchema", "false");
|
||||||
|
parameters.add("langid.maxFieldValueChars", "10000");
|
||||||
|
parameters.add("langid.maxTotalChars", "100000");
|
||||||
|
p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
|
||||||
|
assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue