SOLR-11774: langid.map.individual now works together with langid.map.keepOrig

Jan Høydahl 2019-01-03 18:26:27 +01:00
parent 6342ec699e
commit 00f8f3a13a
8 changed files with 489 additions and 126 deletions

View File: solr/CHANGES.txt

@ -76,6 +76,9 @@ Upgrade Notes
This choice used to be toggleable with an internal/expert "anonChildDocs" parameter flag which is now gone.
(David Smiley)
* SOLR-11774: In 'langid' contrib, the LanguageIdentifierUpdateProcessor base class changed some method signatures.
If you have a custom language identifier implementation you will need to adapt your code.
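For maintainers of a custom identifier, a minimal sketch of the adaptation is shown below. The class name and method body are hypothetical; only the overridden Reader-based signature and the SolrInputDocumentReader.asString helper come from this commit.

package org.apache.solr.update.processor;

import java.io.Reader;
import java.util.Collections;
import java.util.List;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;

/** Hypothetical custom identifier, shown only to illustrate the new signature. */
public class MyLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpdateProcessor {

  public MyLanguageIdentifierUpdateProcessor(SolrQueryRequest req, SolrQueryResponse rsp,
                                             UpdateRequestProcessor next) {
    super(req, rsp, next);
  }

  // Before this change the contract was:
  //   protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc)
  // Now the base class hands the implementation a Reader over the configured fields.
  @Override
  protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
    // Detectors that still need a String can drain the reader with the new helper:
    String content = SolrInputDocumentReader.asString(solrDocReader);
    if (content.isEmpty()) {
      return Collections.emptyList();
    }
    // ... run the actual detection on 'content' and build the result list here ...
    return Collections.emptyList();
  }
}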
New Features
----------------------
@ -100,6 +103,9 @@ Bug Fixes
* SOLR-13058: Fix block that was synchronizing on the wrong collection in OverseerTaskProcessor (Gus Heck)
* SOLR-11774: langid.map.individual now works together with langid.map.keepOrig. Also the detectLanguage() API
is changed to accept a Reader allowing for more memory efficient implementations (janhoy)
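For reference, the parameter combination this fix addresses is sketched below; field names follow the test case added in this commit, and the wrapper class exists only for illustration.

import org.apache.solr.common.params.ModifiableSolrParams;

class LangIdParamsSketch {
  static ModifiableSolrParams individualWithKeepOrig() {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("langid.fl", "text,text2");
    params.set("langid.langField", "language");
    params.set("langid.map", "true");
    params.set("langid.map.individual", "true"); // detect and map each field by its own language
    params.set("langid.map.keepOrig", "true");   // keep the original field next to the mapped copy
    // With the fix, a document with English "text" and Russian "text2" ends up with
    // text, text_en, text2 and text2_ru all populated.
    return params;
  }
}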
Improvements
----------------------

View File: LangDetectLanguageIdentifierUpdateProcessor.java

@ -16,9 +16,10 @@
*/
package org.apache.solr.update.processor;
import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
@ -29,7 +30,6 @@ import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;
import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -48,33 +48,26 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
super(req, rsp, next);
}
/**
* Detects language(s) from a reader, typically based on some fields in SolrInputDocument
* Classes wishing to implement their own language detection module should override this method.
*
* @param solrDocReader A reader serving the text from the document to detect
* @return List of detected language(s) according to RFC-3066
*/
@Override
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
try {
Detector detector = DetectorFactory.create();
detector.setMaxTextLength(maxTotalChars);
for (String fieldName : inputFields) {
log.debug("Appending field " + fieldName);
if (doc.containsKey(fieldName)) {
Collection<Object> fieldValues = doc.getFieldValues(fieldName);
if (fieldValues != null) {
for (Object content : fieldValues) {
if (content instanceof String) {
String stringContent = (String) content;
if (stringContent.length() > maxFieldValueChars) {
detector.append(stringContent.substring(0, maxFieldValueChars));
} else {
detector.append(stringContent);
}
detector.append(" ");
} else {
log.warn("Field " + fieldName + " not a String value, not including in detection");
}
}
}
}
// TODO Work around bug in LangDetect 1.1 which does not expect a -1 return value at end of stream,
// but instead only looks at ready()
if (solrDocReader instanceof SolrInputDocumentReader) {
((SolrInputDocumentReader)solrDocReader).setEodReturnValue(0);
}
detector.append(solrDocReader);
ArrayList<Language> langlist = detector.getProbabilities();
ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
for (Language l: langlist) {
@ -84,6 +77,9 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
} catch (LangDetectException e) {
log.debug("Could not determine language, returning empty list: ", e);
return Collections.emptyList();
} catch (IOException e) {
log.warn("Could not determine language.", e);
return Collections.emptyList();
}
}
}

View File: LanguageIdentifierUpdateProcessor.java

@ -30,10 +30,10 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@ -41,11 +41,11 @@ import java.util.regex.Pattern;
/**
* Identifies the language of a set of input fields.
* Also supports mapping of field names based
* on detected language.
* <p>
* See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
* Identifies the language of a set of input fields.
* Also supports mapping of field names based on detected language.
* </p>
* See <a href="https://lucene.apache.org/solr/guide/7_4/detecting-languages-during-indexing.html">Detecting Languages During Indexing</a> in reference guide
* @since 3.5
* @lucene.experimental
*/
@ -207,11 +207,10 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
}
/**
* This is the main, testable process method called from processAdd()
* @param doc the SolrInputDocument to work on
* @return the modified SolrInputDocument
* This is the main process method called from processAdd()
* @param doc the SolrInputDocument to modify
*/
protected SolrInputDocument process(SolrInputDocument doc) {
protected void process(SolrInputDocument doc) {
String docLang = null;
HashSet<String> docLangs = new HashSet<>();
String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
@ -240,7 +239,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
if(doc.containsKey(fieldName)) {
String fieldLang;
if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
List<DetectedLanguage> languagelist = detectLanguage(doc);
List<DetectedLanguage> languagelist = detectLanguage(solrDocReader(doc, new String[]{fieldName}));
fieldLang = resolveLanguage(languagelist, docLang);
docLangs.add(fieldLang);
log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
@ -270,8 +269,6 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
if(langsField != null && langsField.length() != 0) {
doc.setField(langsField, docLangs.toArray());
}
return doc;
}
/**
@ -297,12 +294,21 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
}
/**
* Detects language(s) from a string.
* Classes wishing to implement their own language detection module should override this method.
* @param content The content to identify
* Detects language(s) from all configured fields
* @param doc The solr document
* @return List of detected language(s) according to RFC-3066
*/
protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content);
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
return detectLanguage(solrDocReader(doc, inputFields));
}
/**
* Detects language(s) from a reader, typically based on some fields in SolrInputDocument
* Classes wishing to implement their own language detection module should override this method.
* @param solrDocReader A reader serving the text from the document to detect
* @return List of detected language(s) according to RFC-3066
*/
protected abstract List<DetectedLanguage> detectLanguage(Reader solrDocReader);
/**
* Chooses a language based on the list of candidates detected
@ -400,67 +406,22 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
this.enabled = enabled;
}
/**
* Concatenates content from multiple fields
* Returns a reader that streams String content from fields.
* This is more memory efficient than building a full string buffer
* @param doc the solr document
* @param fields the field names to read
* @return a reader over the fields
*/
protected SolrInputDocumentReader solrDocReader(SolrInputDocument doc, String[] fields) {
return new SolrInputDocumentReader(doc, fields, maxTotalChars, maxFieldValueChars, " ");
}
/**
* Concatenates content from input fields defined in langid.fl.
* For test purposes only
*/
protected String concatFields(SolrInputDocument doc) {
StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
for (String fieldName : inputFields) {
log.debug("Appending field " + fieldName);
if (doc.containsKey(fieldName)) {
Collection<Object> fieldValues = doc.getFieldValues(fieldName);
if (fieldValues != null) {
for (Object content : fieldValues) {
if (content instanceof String) {
String stringContent = (String) content;
if (stringContent.length() > maxFieldValueChars) {
sb.append(stringContent.substring(0, maxFieldValueChars));
} else {
sb.append(stringContent);
}
sb.append(" ");
if (sb.length() > maxTotalChars) {
sb.setLength(maxTotalChars);
break;
}
} else {
log.warn("Field " + fieldName + " not a String value, not including in detection");
}
}
}
}
}
return sb.toString();
}
/**
* Calculate expected string size.
*
* @param doc solr input document
* @param fields fields to select
* @return expected size of string value
*/
private int getExpectedSize(SolrInputDocument doc, String[] fields) {
int docSize = 0;
for (String field : fields) {
if (doc.containsKey(field)) {
Collection<Object> contents = doc.getFieldValues(field);
if (contents != null) {
for (Object content : contents) {
if (content instanceof String) {
docSize += Math.min(((String) content).length(), maxFieldValueChars);
}
}
if (docSize > maxTotalChars) {
docSize = maxTotalChars;
break;
}
}
}
}
return docSize;
return SolrInputDocumentReader.asString(solrDocReader(doc, inputFields));
}
}

View File: OpenNLPLangDetectUpdateProcessor.java

@ -16,6 +16,7 @@
*/
package org.apache.solr.update.processor;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
@ -23,7 +24,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.slf4j.Logger;
@ -54,9 +54,9 @@ public class OpenNLPLangDetectUpdateProcessor extends LanguageIdentifierUpdatePr
}
@Override
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
List<DetectedLanguage> languages = new ArrayList<>();
String content = concatFields(doc);
String content = SolrInputDocumentReader.asString(solrDocReader);
if (content.length() != 0) {
LanguageDetectorME ldme = new LanguageDetectorME(model);
Language[] langs = ldme.predictLanguages(content);

View File: SolrInputDocumentReader.java

@ -0,0 +1,224 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Reader on top of SolrInputDocument that can "stream" a document as a character stream in a memory
* efficient way, to avoid potentially large intermediate string buffers containing whole document content.
* @lucene.experimental
*/
public class SolrInputDocumentReader extends Reader {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private SolrInputDocument doc;
private final String[] fields;
private final String fieldValueSep;
private final int maxTotalChars;
private final int maxCharsPerFieldValue;
private int totalCharsConsumed;
// Remember where we are at
private int currentFieldIdx = 0;
private int currentFieldValueIdx = 0;
private int currentFieldValueOffset = 0;
private boolean eod = false;
// Normally a Reader will return -1 at end of document, but to work around LangDetect's bug, we allow another value
private int eodReturnValue = -1;
/**
* Creates a character-stream reader that streams all String fields in the document with space as separator
*
* @param doc Solr input document
* @param maxCharsPerFieldValue max chars to consume per field value
* @param maxTotalChars max chars to consume total
*/
public SolrInputDocumentReader(SolrInputDocument doc, int maxTotalChars, int maxCharsPerFieldValue) {
this(doc, getStringFields(doc), maxTotalChars, maxCharsPerFieldValue, " ");
}
/**
* Creates a character-stream reader that reads the listed fields in order, with
* max lengths as specified.
*
* @param doc Solr input document
* @param fields list of field names to include
* @param fieldValueSep separator to insert between field values
* @param maxCharsPerFieldValue max chars to consume per field value
* @param maxTotalChars max chars to consume total
*/
public SolrInputDocumentReader(SolrInputDocument doc, String[] fields, int maxTotalChars,
int maxCharsPerFieldValue, String fieldValueSep) {
this.doc = doc;
this.fields = fields;
this.fieldValueSep = fieldValueSep;
if (fields == null || fields.length == 0) throw new IllegalArgumentException("fields cannot be empty");
this.maxTotalChars = maxTotalChars;
this.maxCharsPerFieldValue = maxCharsPerFieldValue;
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
StringBuilder sb = new StringBuilder(len);
int numChars = fillBuffer(sb, len);
if (numChars > -1) {
sb.getChars(0, numChars, cbuf, off);
}
totalCharsConsumed += numChars;
return numChars;
}
private int fillBuffer(StringBuilder sb, int targetLen) {
if (eod) return eodReturnValue;
if (totalCharsConsumed + targetLen > maxTotalChars) {
targetLen = maxTotalChars - totalCharsConsumed;
}
while (sb.length() < targetLen && !eod) {
nextDocChunk(sb, targetLen);
}
if (sb.length() == 0) {
eod = true;
return eodReturnValue;
} else {
return sb.length();
}
}
private int nextDocChunk(StringBuilder sb, int maxChunkLength) {
if (currentFieldIdx > fields.length-1) {
return returnEod();
}
int startFieldValueIdx = currentFieldValueIdx;
int startFieldValueOffset = currentFieldValueOffset;
do {
SolrInputField f = doc.getField(fields[currentFieldIdx]);
if (f == null) {
log.debug("Field with name {} did not exist on docuemnt.", fields[currentFieldIdx]);
incField(sb);
continue;
}
Iterator<Object> fvIt = f.iterator();
currentFieldValueIdx = -1;
while (fvIt.hasNext() && sb.length() < maxChunkLength) {
currentFieldValueIdx++;
String fvStr = String.valueOf(fvIt.next());
if (currentFieldValueIdx < startFieldValueIdx) continue;
startFieldValueIdx = 0;
if (sb.length() > 0) {
if (maxChunkLength - sb.length() < fieldValueSep.length()) {
sb.append(fieldValueSep.substring(0,maxChunkLength - sb.length()));
} else {
sb.append(fieldValueSep);
}
}
currentFieldValueOffset = startFieldValueOffset;
startFieldValueOffset = 0;
int charsNeeded = maxChunkLength - sb.length();
int endOffset = fvStr.length();
if (fvStr.length() - currentFieldValueOffset > charsNeeded) {
endOffset = currentFieldValueOffset + charsNeeded;
}
if (endOffset - currentFieldValueOffset > maxCharsPerFieldValue) {
endOffset = maxCharsPerFieldValue - currentFieldValueOffset;
}
sb.append(fvStr.substring(currentFieldValueOffset, endOffset));
currentFieldValueOffset = endOffset == fvStr.length() ? 0 : endOffset;
}
if (sb.length() >= maxChunkLength) {
return returnValue(sb);
} else {
incField(sb);
}
} while (currentFieldIdx <= fields.length-1 && sb.length() < maxChunkLength);
return sb.length() == 0 ? eodReturnValue : sb.length();
}
private int returnEod() {
eod = true;
return eodReturnValue;
}
private int returnValue(StringBuilder sb) {
if (sb.length() == 0) {
return returnEod();
} else {
return sb.length();
}
}
private void incField(StringBuilder sb) {
currentFieldIdx++;
currentFieldValueIdx = 0;
currentFieldValueOffset = 0;
}
@Override
public void close() throws IOException { /* ignored */ }
@Override
public boolean ready() throws IOException {
return !eod;
}
/**
* Choose another return value than -1 for end of document reached.
* <b>Warning: Only to work around buggy consumers such as LangDetect 1.1</b>
* @param eodReturnValue integer which defaults to -1
*/
public void setEodReturnValue(int eodReturnValue) {
this.eodReturnValue = eodReturnValue;
}
/**
* Gets the whole reader as a String
* @return string of concatenated fields
*/
public static String asString(Reader reader) {
try {
return IOUtils.toString(reader);
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Failed reading doc content from reader", e);
}
}
protected static String[] getStringFields(SolrInputDocument doc) {
Iterable<SolrInputField> iterable = () -> doc.iterator();
List<String> strFields = StreamSupport.stream(iterable.spliterator(), false)
.filter(f -> f.getFirstValue() instanceof String)
.map(SolrInputField::getName).collect(Collectors.toList());
return strFields.toArray(new String[0]);
}
}
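A small standalone sketch of how the new reader streams a document; field names and limits are made up for the example, while the constructor arguments and read loop follow the class above.

import java.io.Reader;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.update.processor.SolrInputDocumentReader;

public class SolrInputDocumentReaderExample {
  public static void main(String[] args) throws Exception {
    SolrInputDocument doc = new SolrInputDocument();
    doc.addField("title", "A short title");
    doc.addField("body", "Some body text");

    // Stream only the two listed fields, capped at 10000 chars total and
    // 1000 chars per field value, with a single space between values.
    try (Reader reader = new SolrInputDocumentReader(
        doc, new String[]{"title", "body"}, 10000, 1000, " ")) {
      char[] buf = new char[64];
      int n;
      while ((n = reader.read(buf, 0, buf.length)) > 0) {
        System.out.print(new String(buf, 0, n));
      }
      // Overall output: A short title Some body text
    }
  }
}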

View File: TikaLanguageIdentifierUpdateProcessor.java

@ -16,6 +16,7 @@
*/
package org.apache.solr.update.processor;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.List;
@ -23,8 +24,6 @@ import java.util.List;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -44,11 +43,11 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd
SolrQueryResponse rsp, UpdateRequestProcessor next) {
super(req, rsp, next);
}
@Override
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
String content = SolrInputDocumentReader.asString(solrDocReader);
List<DetectedLanguage> languages = new ArrayList<>();
String content = concatFields(doc);
if (content.length() != 0) {
LanguageIdentifier identifier = new LanguageIdentifier(content);
// FIXME: Hack - we get the distance from toString and calculate our own certainty score

View File: LanguageIdentifierUpdateProcessorFactoryTestCase.java

@ -140,14 +140,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
liProcessor = createLangIdProcessor(parameters);
doc = englishDoc();
assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
assertEquals("en", process(doc).getFieldValue("language"));
assertEquals("en", process(doc).getFieldValue("languages"));
doc = englishDoc();
doc.setField("language", "no");
assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
assertNotNull(liProcessor.process(doc).getFieldValue("text_no"));
assertEquals("no", process(doc).getFieldValue("language"));
assertEquals("no", process(doc).getFieldValue("languages"));
assertNotNull(process(doc).getFieldValue("text_no"));
}
/**
@ -166,14 +166,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
liProcessor = createLangIdProcessor(parameters);
doc = englishDoc();
assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
assertEquals("en", process(doc).getFieldValue("language"));
assertEquals("en", process(doc).getFieldValue("languages"));
doc = englishDoc();
doc.setField("language", "no");
assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
assertNotNull(liProcessor.process(doc).getFieldValue("text_multivalue_no"));
assertEquals("no", process(doc).getFieldValue("language"));
assertEquals("no", process(doc).getFieldValue("languages"));
assertNotNull(process(doc).getFieldValue("text_multivalue_no"));
}
/**
@ -192,14 +192,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
liProcessor = createLangIdProcessor(parameters);
doc = mixedEnglishRussianDoc();
assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
assertEquals("en", process(doc).getFieldValue("language"));
assertEquals("en", process(doc).getFieldValue("languages"));
doc = mixedEnglishRussianDoc();
doc.setField("language", "no");
assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
assertNotNull(liProcessor.process(doc).getFieldValue("text_multivalue_no"));
assertEquals("no", process(doc).getFieldValue("language"));
assertEquals("no", process(doc).getFieldValue("languages"));
assertNotNull(process(doc).getFieldValue("text_multivalue_no"));
}
@Test
@ -212,7 +212,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
liProcessor = createLangIdProcessor(parameters);
doc = tooShortDoc();
assertEquals("", liProcessor.process(doc).getFieldValue("language"));
assertEquals("", process(doc).getFieldValue("language"));
}
@Test
@ -225,7 +225,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
liProcessor = createLangIdProcessor(parameters);
doc = new SolrInputDocument();
assertEquals("", liProcessor.process(doc).getFieldValue("language"));
assertEquals("", process(doc).getFieldValue("language"));
}
@Test
@ -242,11 +242,11 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
// Verify fallback to field fb (noop field does not exist and is skipped)
doc = tooShortDoc();
doc.addField("fb", "fbField");
assertEquals("fbField", liProcessor.process(doc).getFieldValue("language"));
assertEquals("fbField", process(doc).getFieldValue("language"));
// Verify fallback to fallback value since no fallback fields exist
doc = tooShortDoc();
assertEquals("fbVal", liProcessor.process(doc).getFieldValue("language"));
assertEquals("fbVal", process(doc).getFieldValue("language"));
}
@Test
@ -272,6 +272,60 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
assertEquals("fallback", liProcessor.resolveLanguage(langs, "fallback"));
}
@Test
public void testKeepOrig() throws Exception {
ModifiableSolrParams parameters = new ModifiableSolrParams();
parameters.set("langid.enforceSchema", "false");
parameters.set("langid.langField", "language");
parameters.set("langid.langsField", "languages");
parameters.set("langid.fl", "text");
parameters.set("langid.map", "true");
parameters.set("langid.map.keepOrig", "false");
liProcessor = createLangIdProcessor(parameters);
SolrInputDocument mappedNoOrig = process(englishDoc());
assertEquals("text_en", liProcessor.getMappedField("text", "en"));
assertEquals("en", mappedNoOrig.getFieldValue("language"));
assertTrue(mappedNoOrig.containsKey("text_en"));
assertFalse(mappedNoOrig.containsKey("text"));
// keepOrig true
parameters.set("langid.map.keepOrig", "true");
liProcessor = createLangIdProcessor(parameters);
SolrInputDocument mappedKeepOrig = process(englishDoc());
assertTrue(mappedKeepOrig.containsKey("text_en"));
assertTrue(mappedKeepOrig.containsKey("text"));
assertEquals(englishDoc().getFieldValue("text"), mappedKeepOrig.getFieldValue("text_en"));
// keepOrig and map individual
parameters.set("langid.map.individual", "true");
parameters.set("langid.fl", "text,text2");
liProcessor = createLangIdProcessor(parameters);
SolrInputDocument mappedIndividual = process(languagePerFieldDoc());
assertTrue(mappedIndividual.containsKey("text_en"));
assertTrue(mappedIndividual.containsKey("text"));
assertTrue(mappedIndividual.containsKey("text2_ru"));
assertTrue(mappedIndividual.containsKey("text2"));
assertEquals(languagePerFieldDoc().getFieldValue("text"), mappedIndividual.getFieldValue("text_en"));
}
@Test
public void testMapIndividual() throws Exception {
ModifiableSolrParams parameters = new ModifiableSolrParams();
parameters.set("langid.enforceSchema", "false");
parameters.set("langid.langField", "language");
parameters.set("langid.langsField", "languages");
parameters.set("langid.fl", "text,text2");
parameters.set("langid.map", "true");
parameters.set("langid.map.individual", "true");
liProcessor = createLangIdProcessor(parameters);
SolrInputDocument mappedIndividual = process(languagePerFieldDoc());
assertTrue(mappedIndividual.containsKey("text_en"));
assertTrue(mappedIndividual.containsKey("text2_ru"));
}
// Various utility methods
@ -282,6 +336,12 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
return doc;
}
private SolrInputDocument languagePerFieldDoc() {
SolrInputDocument doc = englishDoc();
doc.addField("text2", "The Apache Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска, написанная на Java. Может быть использована для поиска в интернете и других областях компьютерной лингвистики (аналитическая философия).");
return doc;
}
/**
* Construct document containing multi-value fields in different languages.
* @return solr input document
@ -307,7 +367,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
if(liProcessor == null)
throw new Exception("Processor must be initialized before calling assertLang()");
SolrInputDocument doc = sid(fieldsAndValues);
assertEquals(langCode, liProcessor.process(doc).getFieldValue(liProcessor.langField));
assertEquals(langCode, process(doc).getFieldValue(liProcessor.langField));
}
private SolrInputDocument sid(String... fieldsAndValues) {
@ -317,4 +377,13 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
}
return doc;
}
/*
Utility test method to process a clone of a document
*/
private SolrInputDocument process(SolrInputDocument origDoc) {
SolrInputDocument modifiedDoc = origDoc.deepCopy();
liProcessor.process(modifiedDoc);
return modifiedDoc;
}
}

View File: SolrInputDocumentReaderTest.java

@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.util.Arrays;
import org.apache.solr.common.SolrInputDocument;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class SolrInputDocumentReaderTest {
private SolrInputDocument doc;
private String[] allFields;
@Before
public void setUp() throws Exception {
doc = new SolrInputDocument();
doc.addField("f1", "a b c");
doc.addField("f2", "multi");
doc.addField("f2", "valued");
doc.addField("f2", "field");
doc.addField("f3", 123);
doc.addField("f4", "12345678901234567890");
allFields = new String[] {"f1", "f2", "f3", "f4"};
}
@Test
public void readChunked() throws Exception {
SolrInputDocumentReader reader = new SolrInputDocumentReader(
doc,
allFields,
20,
18,
" - ");
assertTrue(reader.ready());
char[] chars = new char[1000];
int len;
assertEquals(9, len=reader.read(chars, 0, 9));
assertArrEqu("a b c - m", chars, len);
len += reader.read(chars, 9, 2);
assertArrEqu("a b c - mul", chars, len);
len += reader.read(chars, 11, 1);
assertArrEqu("a b c - mult", chars, len);
len += reader.read(chars, 12, 10);
// We now hit totalMaxChars
assertArrEqu("a b c - multi - valu", chars, len);
}
@Test
public void maxFieldValueLength() throws Exception {
SolrInputDocumentReader reader = new SolrInputDocumentReader(
doc,
allFields,
21,
2,
" - "
);
assertTrue(reader.ready());
char[] chars = new char[1000];
int len = reader.read(chars, 0, 22);
assertEquals(21, len);
assertArrEqu("a - mu - va - fi - 1", chars, len);
}
@Test
public void allStrFields() throws Exception {
SolrInputDocumentReader reader = new SolrInputDocumentReader(
doc,
20000,
10000
);
assertTrue(reader.ready());
char[] chars = new char[1000];
int len = reader.read(chars, 0, 1000);
assertEquals(45, len);
assertArrEqu("a b c multi valued field 12345678901234567890", chars, len);
}
@Test
public void testGetStringFields() throws Exception {
String[] expected = new String[] {"f1", "f2", "f4"};
assertArrayEquals(expected, SolrInputDocumentReader.getStringFields(doc));
}
private void assertArrEqu(String expected, char[] chars, int len) {
String str = new String(Arrays.copyOf(chars, len));
assertEquals(expected, str);
}
}