mirror of https://github.com/apache/lucene.git
SOLR-11774: langid.map.individual now works together with langid.map.keepOrig
This commit is contained in:
parent
6342ec699e
commit
00f8f3a13a
|
@ -76,6 +76,9 @@ Upgrade Notes
|
|||
This choice used to be toggleable with an internal/expert "anonChildDocs" parameter flag which is now gone.
|
||||
(David Smiley)
|
||||
|
||||
* SOLR-11774: In 'langid' contrib, the LanguageIdentifierUpdateProcessor base class changed some method signatures.
|
||||
If you have a custom language identifier implementation you will need to adapt your code.
|
||||
|
||||
New Features
|
||||
----------------------
|
||||
|
||||
|
@ -100,6 +103,9 @@ Bug Fixes
|
|||
|
||||
* SOLR-13058: Fix block that was synchronizing on the wrong collection in OverseerTaskProcessor (Gus Heck)
|
||||
|
||||
* SOLR-11774: langid.map.individual now works together with langid.map.keepOrig. Also the detectLanguage() API
|
||||
is changed to accept a Reader, allowing for more memory-efficient implementations (janhoy)
|
||||
|
||||
Improvements
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -16,9 +16,10 @@
|
|||
*/
|
||||
package org.apache.solr.update.processor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
|
@ -29,7 +30,6 @@ import com.cybozu.labs.langdetect.Detector;
|
|||
import com.cybozu.labs.langdetect.DetectorFactory;
|
||||
import com.cybozu.labs.langdetect.LangDetectException;
|
||||
import com.cybozu.labs.langdetect.Language;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -48,33 +48,26 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
|
|||
super(req, rsp, next);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects language(s) from a reader, typically based on some fields in SolrInputDocument.
|
||||
* Classes wishing to implement their own language detection module should override this method.
|
||||
*
|
||||
* @param solrDocReader A reader serving the text from the document to detect
|
||||
* @return List of detected language(s) according to RFC-3066
|
||||
*/
|
||||
@Override
|
||||
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
|
||||
protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
|
||||
try {
|
||||
Detector detector = DetectorFactory.create();
|
||||
detector.setMaxTextLength(maxTotalChars);
|
||||
|
||||
for (String fieldName : inputFields) {
|
||||
log.debug("Appending field " + fieldName);
|
||||
if (doc.containsKey(fieldName)) {
|
||||
Collection<Object> fieldValues = doc.getFieldValues(fieldName);
|
||||
if (fieldValues != null) {
|
||||
for (Object content : fieldValues) {
|
||||
if (content instanceof String) {
|
||||
String stringContent = (String) content;
|
||||
if (stringContent.length() > maxFieldValueChars) {
|
||||
detector.append(stringContent.substring(0, maxFieldValueChars));
|
||||
} else {
|
||||
detector.append(stringContent);
|
||||
}
|
||||
detector.append(" ");
|
||||
} else {
|
||||
log.warn("Field " + fieldName + " not a String value, not including in detection");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO Work around bug in LangDetect 1.1 which does not expect a -1 return value at end of stream,
|
||||
// but instead only looks at ready()
|
||||
if (solrDocReader instanceof SolrInputDocumentReader) {
|
||||
((SolrInputDocumentReader)solrDocReader).setEodReturnValue(0);
|
||||
}
|
||||
detector.append(solrDocReader);
|
||||
|
||||
ArrayList<Language> langlist = detector.getProbabilities();
|
||||
ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
|
||||
for (Language l: langlist) {
|
||||
|
@ -84,6 +77,9 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
|
|||
} catch (LangDetectException e) {
|
||||
log.debug("Could not determine language, returning empty list: ", e);
|
||||
return Collections.emptyList();
|
||||
} catch (IOException e) {
|
||||
log.warn("Could not determine language.", e);
|
||||
return Collections.emptyList();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,10 +30,10 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
@ -41,11 +41,11 @@ import java.util.regex.Pattern;
|
|||
|
||||
|
||||
/**
|
||||
* Identifies the language of a set of input fields.
|
||||
* Also supports mapping of field names based
|
||||
* on detected language.
|
||||
* <p>
|
||||
* See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
|
||||
* Identifies the language of a set of input fields.
|
||||
* Also supports mapping of field names based on detected language.
|
||||
* </p>
|
||||
* See <a href="https://lucene.apache.org/solr/guide/7_4/detecting-languages-during-indexing.html">Detecting Languages During Indexing</a> in reference guide
|
||||
* @since 3.5
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
@ -207,11 +207,10 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
|||
}
|
||||
|
||||
/**
|
||||
* This is the main, testable process method called from processAdd()
|
||||
* @param doc the SolrInputDocument to work on
|
||||
* @return the modified SolrInputDocument
|
||||
* This is the main process method called from processAdd()
|
||||
* @param doc the SolrInputDocument to modify
|
||||
*/
|
||||
protected SolrInputDocument process(SolrInputDocument doc) {
|
||||
protected void process(SolrInputDocument doc) {
|
||||
String docLang = null;
|
||||
HashSet<String> docLangs = new HashSet<>();
|
||||
String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
|
||||
|
@ -240,7 +239,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
|||
if(doc.containsKey(fieldName)) {
|
||||
String fieldLang;
|
||||
if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
|
||||
List<DetectedLanguage> languagelist = detectLanguage(doc);
|
||||
List<DetectedLanguage> languagelist = detectLanguage(solrDocReader(doc, new String[]{fieldName}));
|
||||
fieldLang = resolveLanguage(languagelist, docLang);
|
||||
docLangs.add(fieldLang);
|
||||
log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
|
||||
|
@ -270,8 +269,6 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
|||
if(langsField != null && langsField.length() != 0) {
|
||||
doc.setField(langsField, docLangs.toArray());
|
||||
}
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -297,12 +294,21 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
|||
}
|
||||
|
||||
/**
|
||||
* Detects language(s) from a string.
|
||||
* Classes wishing to implement their own language detection module should override this method.
|
||||
* @param content The content to identify
|
||||
* Detects language(s) from all configured fields
|
||||
* @param doc The solr document
|
||||
* @return List of detected language(s) according to RFC-3066
|
||||
*/
|
||||
protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content);
|
||||
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
|
||||
return detectLanguage(solrDocReader(doc, inputFields));
|
||||
}
|
||||
|
||||
/**
|
||||
* Detects language(s) from a reader, typically based on some fields in SolrInputDocument.
|
||||
* Classes wishing to implement their own language detection module should override this method.
|
||||
* @param solrDocReader A reader serving the text from the document to detect
|
||||
* @return List of detected language(s) according to RFC-3066
|
||||
*/
|
||||
protected abstract List<DetectedLanguage> detectLanguage(Reader solrDocReader);
|
||||
|
||||
/**
|
||||
* Chooses a language based on the list of candidates detected
|
||||
|
@ -400,67 +406,22 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
|||
this.enabled = enabled;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Concatenates content from multiple fields
|
||||
* Returns a reader that streams String content from fields.
|
||||
* This is more memory efficient than building a full string buffer
|
||||
* @param doc the solr document
|
||||
* @param fields the field names to read
|
||||
* @return a reader over the fields
|
||||
*/
|
||||
protected SolrInputDocumentReader solrDocReader(SolrInputDocument doc, String[] fields) {
|
||||
return new SolrInputDocumentReader(doc, fields, maxTotalChars, maxFieldValueChars, " ");
|
||||
}
|
||||
|
||||
/**
|
||||
* Concatenates content from input fields defined in langid.fl.
|
||||
* For test purposes only
|
||||
*/
|
||||
protected String concatFields(SolrInputDocument doc) {
|
||||
StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
|
||||
for (String fieldName : inputFields) {
|
||||
log.debug("Appending field " + fieldName);
|
||||
if (doc.containsKey(fieldName)) {
|
||||
Collection<Object> fieldValues = doc.getFieldValues(fieldName);
|
||||
if (fieldValues != null) {
|
||||
for (Object content : fieldValues) {
|
||||
if (content instanceof String) {
|
||||
String stringContent = (String) content;
|
||||
if (stringContent.length() > maxFieldValueChars) {
|
||||
sb.append(stringContent.substring(0, maxFieldValueChars));
|
||||
} else {
|
||||
sb.append(stringContent);
|
||||
}
|
||||
sb.append(" ");
|
||||
if (sb.length() > maxTotalChars) {
|
||||
sb.setLength(maxTotalChars);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
log.warn("Field " + fieldName + " not a String value, not including in detection");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate expected string size.
|
||||
*
|
||||
* @param doc solr input document
|
||||
* @param fields fields to select
|
||||
* @return expected size of string value
|
||||
*/
|
||||
private int getExpectedSize(SolrInputDocument doc, String[] fields) {
|
||||
int docSize = 0;
|
||||
for (String field : fields) {
|
||||
if (doc.containsKey(field)) {
|
||||
Collection<Object> contents = doc.getFieldValues(field);
|
||||
if (contents != null) {
|
||||
for (Object content : contents) {
|
||||
if (content instanceof String) {
|
||||
docSize += Math.min(((String) content).length(), maxFieldValueChars);
|
||||
}
|
||||
}
|
||||
|
||||
if (docSize > maxTotalChars) {
|
||||
docSize = maxTotalChars;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return docSize;
|
||||
return SolrInputDocumentReader.asString(solrDocReader(doc, inputFields));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
*/
|
||||
package org.apache.solr.update.processor;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
|
@ -23,7 +24,6 @@ import java.util.List;
|
|||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -54,9 +54,9 @@ public class OpenNLPLangDetectUpdateProcessor extends LanguageIdentifierUpdatePr
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
|
||||
protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
|
||||
List<DetectedLanguage> languages = new ArrayList<>();
|
||||
String content = concatFields(doc);
|
||||
String content = SolrInputDocumentReader.asString(solrDocReader);
|
||||
if (content.length() != 0) {
|
||||
LanguageDetectorME ldme = new LanguageDetectorME(model);
|
||||
Language[] langs = ldme.predictLanguages(content);
|
||||
|
|
|
@ -0,0 +1,224 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.update.processor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.SolrInputField;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Reader on top of SolrInputDocument that can "stream" a document as a character stream in a memory
|
||||
* efficient way, to avoid potentially large intermediate string buffers containing whole document content.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SolrInputDocumentReader extends Reader {
|
||||
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
||||
|
||||
private SolrInputDocument doc;
|
||||
private final String[] fields;
|
||||
private final String fieldValueSep;
|
||||
private final int maxTotalChars;
|
||||
private final int maxCharsPerFieldValue;
|
||||
private int totalCharsConsumed;
|
||||
|
||||
// Remember where we are at
|
||||
private int currentFieldIdx = 0;
|
||||
private int currentFieldValueIdx = 0;
|
||||
private int currentFieldValueOffset = 0;
|
||||
private boolean eod = false;
|
||||
// Normally a Reader will return -1 at end of document, but to work around LangDetect's bug, we allow another value
|
||||
private int eodReturnValue = -1;
|
||||
|
||||
/**
|
||||
* Creates a character-stream reader that streams all String fields in the document with space as separator
|
||||
*
|
||||
* @param doc Solr input document
|
||||
* @param maxCharsPerFieldValue max chars to consume per field value
|
||||
* @param maxTotalChars max chars to consume total
|
||||
*/
|
||||
public SolrInputDocumentReader(SolrInputDocument doc, int maxTotalChars, int maxCharsPerFieldValue) {
|
||||
this(doc, getStringFields(doc), maxTotalChars, maxCharsPerFieldValue, " ");
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a character-stream reader that reads the listed fields in order, with
|
||||
* max lengths as specified.
|
||||
*
|
||||
* @param doc Solr input document
|
||||
* @param fields list of field names to include
|
||||
* @param fieldValueSep separator to insert between field values
|
||||
* @param maxCharsPerFieldValue max chars to consume per field value
|
||||
* @param maxTotalChars max chars to consume total
|
||||
*/
|
||||
public SolrInputDocumentReader(SolrInputDocument doc, String[] fields, int maxTotalChars,
|
||||
int maxCharsPerFieldValue, String fieldValueSep) {
|
||||
this.doc = doc;
|
||||
this.fields = fields;
|
||||
this.fieldValueSep = fieldValueSep;
|
||||
if (fields == null || fields.length == 0) throw new IllegalArgumentException("fields cannot be empty");
|
||||
this.maxTotalChars = maxTotalChars;
|
||||
this.maxCharsPerFieldValue = maxCharsPerFieldValue;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
StringBuilder sb = new StringBuilder(len);
|
||||
int numChars = fillBuffer(sb, len);
|
||||
|
||||
if (numChars > -1) {
|
||||
sb.getChars(0, numChars, cbuf, off);
|
||||
}
|
||||
totalCharsConsumed += numChars;
|
||||
return numChars;
|
||||
}
|
||||
|
||||
private int fillBuffer(StringBuilder sb, int targetLen) {
|
||||
if (eod) return eodReturnValue;
|
||||
if (totalCharsConsumed + targetLen > maxTotalChars) {
|
||||
targetLen = maxTotalChars - totalCharsConsumed;
|
||||
}
|
||||
|
||||
while (sb.length() < targetLen && !eod) {
|
||||
nextDocChunk(sb, targetLen);
|
||||
}
|
||||
|
||||
if (sb.length() == 0) {
|
||||
eod = true;
|
||||
return eodReturnValue;
|
||||
} else {
|
||||
return sb.length();
|
||||
}
|
||||
}
|
||||
|
||||
private int nextDocChunk(StringBuilder sb, int maxChunkLength) {
|
||||
if (currentFieldIdx > fields.length-1) {
|
||||
return returnEod();
|
||||
}
|
||||
|
||||
int startFieldValueIdx = currentFieldValueIdx;
|
||||
int startFieldValueOffset = currentFieldValueOffset;
|
||||
|
||||
do {
|
||||
SolrInputField f = doc.getField(fields[currentFieldIdx]);
|
||||
if (f == null) {
|
||||
log.debug("Field with name {} did not exist on docuemnt.", fields[currentFieldIdx]);
|
||||
incField(sb);
|
||||
continue;
|
||||
}
|
||||
Iterator<Object> fvIt = f.iterator();
|
||||
currentFieldValueIdx = -1;
|
||||
while (fvIt.hasNext() && sb.length() < maxChunkLength) {
|
||||
currentFieldValueIdx++;
|
||||
String fvStr = String.valueOf(fvIt.next());
|
||||
if (currentFieldValueIdx < startFieldValueIdx) continue;
|
||||
startFieldValueIdx = 0;
|
||||
if (sb.length() > 0) {
|
||||
if (maxChunkLength - sb.length() < fieldValueSep.length()) {
|
||||
sb.append(fieldValueSep.substring(0,maxChunkLength - sb.length()));
|
||||
} else {
|
||||
sb.append(fieldValueSep);
|
||||
}
|
||||
}
|
||||
currentFieldValueOffset = startFieldValueOffset;
|
||||
startFieldValueOffset = 0;
|
||||
int charsNeeded = maxChunkLength - sb.length();
|
||||
int endOffset = fvStr.length();
|
||||
if (fvStr.length() - currentFieldValueOffset > charsNeeded) {
|
||||
endOffset = currentFieldValueOffset + charsNeeded;
|
||||
}
|
||||
if (endOffset - currentFieldValueOffset > maxCharsPerFieldValue) {
|
||||
endOffset = maxCharsPerFieldValue - currentFieldValueOffset;
|
||||
}
|
||||
sb.append(fvStr.substring(currentFieldValueOffset, endOffset));
|
||||
currentFieldValueOffset = endOffset == fvStr.length() ? 0 : endOffset;
|
||||
}
|
||||
if (sb.length() >= maxChunkLength) {
|
||||
return returnValue(sb);
|
||||
} else {
|
||||
incField(sb);
|
||||
}
|
||||
} while (currentFieldIdx <= fields.length-1 && sb.length() < maxChunkLength);
|
||||
return sb.length() == 0 ? eodReturnValue : sb.length();
|
||||
}
|
||||
|
||||
private int returnEod() {
|
||||
eod = true;
|
||||
return eodReturnValue;
|
||||
}
|
||||
|
||||
private int returnValue(StringBuilder sb) {
|
||||
if (sb.length() == 0) {
|
||||
return returnEod();
|
||||
} else {
|
||||
return sb.length();
|
||||
}
|
||||
}
|
||||
|
||||
private void incField(StringBuilder sb) {
|
||||
currentFieldIdx++;
|
||||
currentFieldValueIdx = 0;
|
||||
currentFieldValueOffset = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException { /* ignored */ }
|
||||
|
||||
@Override
|
||||
public boolean ready() throws IOException {
|
||||
return !eod;
|
||||
}
|
||||
|
||||
/**
|
||||
* Choose another return value than -1 for end of document reached.
|
||||
* <b>Warning: Only to work around buggy consumers such as LangDetect 1.1</b>
|
||||
* @param eodReturnValue integer which defaults to -1
|
||||
*/
|
||||
public void setEodReturnValue(int eodReturnValue) {
|
||||
this.eodReturnValue = eodReturnValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the whole reader as a String
|
||||
* @return string of concatenated fields
|
||||
*/
|
||||
public static String asString(Reader reader) {
|
||||
try {
|
||||
return IOUtils.toString(reader);
|
||||
} catch (IOException e) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Failed reading doc content from reader", e);
|
||||
}
|
||||
}
|
||||
|
||||
protected static String[] getStringFields(SolrInputDocument doc) {
|
||||
Iterable<SolrInputField> iterable = () -> doc.iterator();
|
||||
List<String> strFields = StreamSupport.stream(iterable.spliterator(), false)
|
||||
.filter(f -> f.getFirstValue() instanceof String)
|
||||
.map(SolrInputField::getName).collect(Collectors.toList());
|
||||
return strFields.toArray(new String[0]);
|
||||
}
|
||||
}
|
|
@ -16,6 +16,7 @@
|
|||
*/
|
||||
package org.apache.solr.update.processor;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -23,8 +24,6 @@ import java.util.List;
|
|||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.tika.language.LanguageIdentifier;
|
||||
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -44,11 +43,11 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd
|
|||
SolrQueryResponse rsp, UpdateRequestProcessor next) {
|
||||
super(req, rsp, next);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
|
||||
protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
|
||||
String content = SolrInputDocumentReader.asString(solrDocReader);
|
||||
List<DetectedLanguage> languages = new ArrayList<>();
|
||||
String content = concatFields(doc);
|
||||
if (content.length() != 0) {
|
||||
LanguageIdentifier identifier = new LanguageIdentifier(content);
|
||||
// FIXME: Hack - we get the distance from toString and calculate our own certainty score
|
||||
|
|
|
@ -140,14 +140,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
|||
liProcessor = createLangIdProcessor(parameters);
|
||||
|
||||
doc = englishDoc();
|
||||
assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
|
||||
assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
|
||||
assertEquals("en", process(doc).getFieldValue("language"));
|
||||
assertEquals("en", process(doc).getFieldValue("languages"));
|
||||
|
||||
doc = englishDoc();
|
||||
doc.setField("language", "no");
|
||||
assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
|
||||
assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
|
||||
assertNotNull(liProcessor.process(doc).getFieldValue("text_no"));
|
||||
assertEquals("no", process(doc).getFieldValue("language"));
|
||||
assertEquals("no", process(doc).getFieldValue("languages"));
|
||||
assertNotNull(process(doc).getFieldValue("text_no"));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -166,14 +166,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
|||
liProcessor = createLangIdProcessor(parameters);
|
||||
|
||||
doc = englishDoc();
|
||||
assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
|
||||
assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
|
||||
assertEquals("en", process(doc).getFieldValue("language"));
|
||||
assertEquals("en", process(doc).getFieldValue("languages"));
|
||||
|
||||
doc = englishDoc();
|
||||
doc.setField("language", "no");
|
||||
assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
|
||||
assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
|
||||
assertNotNull(liProcessor.process(doc).getFieldValue("text_multivalue_no"));
|
||||
assertEquals("no", process(doc).getFieldValue("language"));
|
||||
assertEquals("no", process(doc).getFieldValue("languages"));
|
||||
assertNotNull(process(doc).getFieldValue("text_multivalue_no"));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -192,14 +192,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
|||
liProcessor = createLangIdProcessor(parameters);
|
||||
|
||||
doc = mixedEnglishRussianDoc();
|
||||
assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
|
||||
assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
|
||||
assertEquals("en", process(doc).getFieldValue("language"));
|
||||
assertEquals("en", process(doc).getFieldValue("languages"));
|
||||
|
||||
doc = mixedEnglishRussianDoc();
|
||||
doc.setField("language", "no");
|
||||
assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
|
||||
assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
|
||||
assertNotNull(liProcessor.process(doc).getFieldValue("text_multivalue_no"));
|
||||
assertEquals("no", process(doc).getFieldValue("language"));
|
||||
assertEquals("no", process(doc).getFieldValue("languages"));
|
||||
assertNotNull(process(doc).getFieldValue("text_multivalue_no"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -212,7 +212,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
|||
liProcessor = createLangIdProcessor(parameters);
|
||||
|
||||
doc = tooShortDoc();
|
||||
assertEquals("", liProcessor.process(doc).getFieldValue("language"));
|
||||
assertEquals("", process(doc).getFieldValue("language"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -225,7 +225,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
|||
liProcessor = createLangIdProcessor(parameters);
|
||||
|
||||
doc = new SolrInputDocument();
|
||||
assertEquals("", liProcessor.process(doc).getFieldValue("language"));
|
||||
assertEquals("", process(doc).getFieldValue("language"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -242,11 +242,11 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
|||
// Verify fallback to field fb (noop field does not exist and is skipped)
|
||||
doc = tooShortDoc();
|
||||
doc.addField("fb", "fbField");
|
||||
assertEquals("fbField", liProcessor.process(doc).getFieldValue("language"));
|
||||
assertEquals("fbField", process(doc).getFieldValue("language"));
|
||||
|
||||
// Verify fallback to fallback value since no fallback fields exist
|
||||
doc = tooShortDoc();
|
||||
assertEquals("fbVal", liProcessor.process(doc).getFieldValue("language"));
|
||||
assertEquals("fbVal", process(doc).getFieldValue("language"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -272,6 +272,60 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
|||
assertEquals("fallback", liProcessor.resolveLanguage(langs, "fallback"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testKeepOrig() throws Exception {
|
||||
ModifiableSolrParams parameters = new ModifiableSolrParams();
|
||||
parameters.set("langid.enforceSchema", "false");
|
||||
parameters.set("langid.langField", "language");
|
||||
parameters.set("langid.langsField", "languages");
|
||||
parameters.set("langid.fl", "text");
|
||||
parameters.set("langid.map", "true");
|
||||
parameters.set("langid.map.keepOrig", "false");
|
||||
liProcessor = createLangIdProcessor(parameters);
|
||||
|
||||
SolrInputDocument mappedNoOrig = process(englishDoc());
|
||||
assertEquals("text_en", liProcessor.getMappedField("text", "en"));
|
||||
assertEquals("en", mappedNoOrig.getFieldValue("language"));
|
||||
assertTrue(mappedNoOrig.containsKey("text_en"));
|
||||
assertFalse(mappedNoOrig.containsKey("text"));
|
||||
|
||||
// keepOrig true
|
||||
parameters.set("langid.map.keepOrig", "true");
|
||||
liProcessor = createLangIdProcessor(parameters);
|
||||
|
||||
SolrInputDocument mappedKeepOrig = process(englishDoc());
|
||||
assertTrue(mappedKeepOrig.containsKey("text_en"));
|
||||
assertTrue(mappedKeepOrig.containsKey("text"));
|
||||
assertEquals(englishDoc().getFieldValue("text"), mappedKeepOrig.getFieldValue("text_en"));
|
||||
|
||||
// keepOrig and map individual
|
||||
parameters.set("langid.map.individual", "true");
|
||||
parameters.set("langid.fl", "text,text2");
|
||||
liProcessor = createLangIdProcessor(parameters);
|
||||
|
||||
SolrInputDocument mappedIndividual = process(languagePerFieldDoc());
|
||||
assertTrue(mappedIndividual.containsKey("text_en"));
|
||||
assertTrue(mappedIndividual.containsKey("text"));
|
||||
assertTrue(mappedIndividual.containsKey("text2_ru"));
|
||||
assertTrue(mappedIndividual.containsKey("text2"));
|
||||
assertEquals(languagePerFieldDoc().getFieldValue("text"), mappedIndividual.getFieldValue("text_en"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMapIndividual() throws Exception {
|
||||
ModifiableSolrParams parameters = new ModifiableSolrParams();
|
||||
parameters.set("langid.enforceSchema", "false");
|
||||
parameters.set("langid.langField", "language");
|
||||
parameters.set("langid.langsField", "languages");
|
||||
parameters.set("langid.fl", "text,text2");
|
||||
parameters.set("langid.map", "true");
|
||||
parameters.set("langid.map.individual", "true");
|
||||
liProcessor = createLangIdProcessor(parameters);
|
||||
|
||||
SolrInputDocument mappedIndividual = process(languagePerFieldDoc());
|
||||
assertTrue(mappedIndividual.containsKey("text_en"));
|
||||
assertTrue(mappedIndividual.containsKey("text2_ru"));
|
||||
}
|
||||
|
||||
// Various utility methods
|
||||
|
||||
|
@ -282,6 +336,12 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
|||
return doc;
|
||||
}
|
||||
|
||||
private SolrInputDocument languagePerFieldDoc() {
|
||||
SolrInputDocument doc = englishDoc();
|
||||
doc.addField("text2", "The Apache Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска, написанная на Java. Может быть использована для поиска в интернете и других областях компьютерной лингвистики (аналитическая философия).");
|
||||
return doc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct document containing multi-value fields in different languages.
|
||||
* @return solr input document
|
||||
|
@ -307,7 +367,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
|||
if(liProcessor == null)
|
||||
throw new Exception("Processor must be initialized before calling assertLang()");
|
||||
SolrInputDocument doc = sid(fieldsAndValues);
|
||||
assertEquals(langCode, liProcessor.process(doc).getFieldValue(liProcessor.langField));
|
||||
assertEquals(langCode, process(doc).getFieldValue(liProcessor.langField));
|
||||
}
|
||||
|
||||
private SolrInputDocument sid(String... fieldsAndValues) {
|
||||
|
@ -317,4 +377,13 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
|||
}
|
||||
return doc;
|
||||
}
|
||||
|
||||
/*
|
||||
Utility test method to process a clone of a document
|
||||
*/
|
||||
private SolrInputDocument process(SolrInputDocument origDoc) {
|
||||
SolrInputDocument modifiedDoc = origDoc.deepCopy();
|
||||
liProcessor.process(modifiedDoc);
|
||||
return modifiedDoc;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.update.processor;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.junit.Assert.assertArrayEquals;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class SolrInputDocumentReaderTest {
|
||||
private SolrInputDocument doc;
|
||||
private String[] allFields;
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
doc = new SolrInputDocument();
|
||||
doc.addField("f1", "a b c");
|
||||
doc.addField("f2", "multi");
|
||||
doc.addField("f2", "valued");
|
||||
doc.addField("f2", "field");
|
||||
doc.addField("f3", 123);
|
||||
doc.addField("f4", "12345678901234567890");
|
||||
allFields = new String[] {"f1", "f2", "f3", "f4"};
|
||||
}
|
||||
|
||||
@Test
|
||||
public void readChunked() throws Exception {
|
||||
SolrInputDocumentReader reader = new SolrInputDocumentReader(
|
||||
doc,
|
||||
allFields,
|
||||
20,
|
||||
18,
|
||||
" - ");
|
||||
assertTrue(reader.ready());
|
||||
char[] chars = new char[1000];
|
||||
int len;
|
||||
assertEquals(9, len=reader.read(chars, 0, 9));
|
||||
assertArrEqu("a b c - m", chars, len);
|
||||
len += reader.read(chars, 9, 2);
|
||||
assertArrEqu("a b c - mul", chars, len);
|
||||
len += reader.read(chars, 11, 1);
|
||||
assertArrEqu("a b c - mult", chars, len);
|
||||
len += reader.read(chars, 12, 10);
|
||||
// We now hit totalMaxChars
|
||||
assertArrEqu("a b c - multi - valu", chars, len);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void maxFieldValueLength() throws Exception {
|
||||
SolrInputDocumentReader reader = new SolrInputDocumentReader(
|
||||
doc,
|
||||
allFields,
|
||||
21,
|
||||
2,
|
||||
" - "
|
||||
);
|
||||
assertTrue(reader.ready());
|
||||
char[] chars = new char[1000];
|
||||
int len = reader.read(chars, 0, 22);
|
||||
assertEquals(21, len);
|
||||
assertArrEqu("a - mu - va - fi - 1", chars, len);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void allStrFields() throws Exception {
|
||||
SolrInputDocumentReader reader = new SolrInputDocumentReader(
|
||||
doc,
|
||||
20000,
|
||||
10000
|
||||
);
|
||||
assertTrue(reader.ready());
|
||||
char[] chars = new char[1000];
|
||||
int len = reader.read(chars, 0, 1000);
|
||||
assertEquals(45, len);
|
||||
assertArrEqu("a b c multi valued field 12345678901234567890", chars, len);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetStringFields() throws Exception {
|
||||
String[] expected = new String[] {"f1", "f2", "f4"};
|
||||
assertArrayEquals(expected, SolrInputDocumentReader.getStringFields(doc));
|
||||
}
|
||||
|
||||
private void assertArrEqu(String expected, char[] chars, int len) {
|
||||
String str = new String(Arrays.copyOf(chars, len));
|
||||
assertEquals(expected, str);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue