SOLR-1979: Create LanguageIdentifierUpdateProcessor

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1179416 13f79535-47bb-0310-9956-ffa450edef68
Jan Høydahl 2011-10-05 20:21:59 +00:00
parent 54eba0f549
commit b7aee61754
14 changed files with 1070 additions and 0 deletions

View File

@@ -71,6 +71,9 @@
<classpathentry kind="src" path="solr/contrib/extraction/src/java"/>
<classpathentry kind="src" path="solr/contrib/extraction/src/test"/>
<classpathentry kind="src" path="solr/contrib/extraction/src/test-files"/>
<classpathentry kind="src" path="solr/contrib/langid/src/java"/>
<classpathentry kind="src" path="solr/contrib/langid/src/test"/>
<classpathentry kind="src" path="solr/contrib/langid/src/test-files"/>
<classpathentry kind="src" path="solr/contrib/uima/src/java"/>
<classpathentry kind="src" path="solr/contrib/uima/src/resources"/>
<classpathentry kind="src" path="solr/contrib/uima/src/test"/>

View File

@@ -354,6 +354,9 @@ New Features
* SOLR-2769: Added factory for the new Hunspell stemmer capable of doing stemming
for 99 languages (janhoy, cmale)
* SOLR-1979: New contrib "langid". Adds language identification capabilities as an
Update Processor, using Tika's LanguageIdentifier (janhoy, Tommaso Teofili, gsingers)
Bug Fixes
----------------------
* SOLR-2748: The CommitTracker used for commitWith or autoCommit by maxTime

View File

@@ -456,12 +456,14 @@
<packageset dir="contrib/dataimporthandler/src/java"/>
<packageset dir="contrib/dataimporthandler-extras/src/java"/>
<packageset dir="contrib/extraction/src/java"/>
<packageset dir="contrib/langid/src/java"/>
<packageset dir="contrib/uima/src/java"/>
<group title="Core" packages="org.apache.*" />
<group title="SolrJ" packages="org.apache.solr.common.*,org.apache.solr.client.solrj*" />
<group title="contrib: Clustering" packages="org.apache.solr.handler.clustering*" />
<group title="contrib: DataImportHandler" packages="org.apache.solr.handler.dataimport*" />
<group title="contrib: Solr Cell" packages="org.apache.solr.handler.extraction*" />
<group title="contrib: Solr LangId" packages="org.apache.solr.update.processor.LanguageIdentifier*,org.apache.solr.update.processor.LangIdParams*,org.apache.solr.update.processor.DetectedLanguage*" />
<group title="contrib: Solr UIMA" packages="org.apache.solr.uima*" />
</sources>
</invoke-javadoc>

View File

@@ -0,0 +1,15 @@
Apache Solr Language Identifier
Release Notes
This file describes changes to the Solr Language Identifier (contrib/langid) module.
See http://wiki.apache.org/solr/LanguageDetection for details
$Id$
================== Release 3.5.0 ==================
Initial release. See README.txt.
* SOLR-1979: New contrib "langid". Adds language identification capabilities as an
Update Processor, using Tika's LanguageIdentifier (janhoy, Tommaso Teofili, gsingers)

View File

@@ -0,0 +1,23 @@
Apache Solr Language Identifier
Introduction
------------
This module is intended to be used while indexing documents.
It is implemented as an UpdateProcessor to be placed in an UpdateChain.
Its purpose is to identify the language of a document and tag the document with its language code.
The module can optionally map field names to their language-specific counterparts,
e.g. if the input field is "title" and the language is detected as "en", the content is mapped to "title_en".
Language may be detected globally for the document, and/or individually per field.
The module currently relies on Tika's language identification capabilities.
Getting Started
---------------
Please refer to the module documentation at http://wiki.apache.org/solr/LanguageDetection
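As a quick sketch (field names here are examples only; see the wiki and the example
solrconfig.xml for the full set of parameters), an update chain using this processor
could look like:
  <updateRequestProcessorChain name="langid">
    <processor class="org.apache.solr.update.processor.LanguageIdentifierUpdateProcessorFactory">
      <str name="langid.fl">title,text</str>
      <str name="langid.langField">language_s</str>
      <str name="langid.fallback">en</str>
    </processor>
    <processor class="solr.RunUpdateProcessorFactory" />
  </updateRequestProcessorChain>
The chain is then referenced from an update handler via the update.chain request parameter.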
Dependencies
------------
This contrib depends on Tika Core, which is part of extraction contrib.
The easiest approach is thus to install the extraction contrib first, then langid.
Alternatively you can include tika-core manually on your classpath.
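For example, the jars can be made available to a core with <lib> directives in
solrconfig.xml (directory paths below are illustrative, adjust them to your layout):
  <lib dir="../../contrib/extraction/lib" regex="tika-core-.*\.jar" />
  <lib dir="../../dist/" regex="apache-solr-langid-\d.*\.jar" />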

View File

@@ -0,0 +1,33 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="solr-langid" default="default">
<description>
Language Identifier contrib for extracting language from a document being indexed
</description>
<import file="../contrib-build.xml"/>
<path id="classpath">
<fileset dir="../extraction/lib" includes="*.jar"/>
<path refid="solr.base.classpath"/>
</path>
</project>

View File

@@ -0,0 +1,47 @@
package org.apache.solr.update.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Bean holding a language and a detection certainty
*/
public class DetectedLanguage {
private final String langCode;
private final Double certainty;
DetectedLanguage(String lang, Double certainty) {
this.langCode = lang;
this.certainty = certainty;
}
/**
* Returns the detected language code
* @return language code as a string
*/
public String getLangCode() {
return langCode;
}
/**
* Returns the detected certainty for this language
* @return certainty as a value between 0.0 and 1.0 where 1.0 is 100% certain
*/
public Double getCertainty() {
return certainty;
}
}

View File

@@ -0,0 +1,53 @@
package org.apache.solr.update.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public interface LangIdParams {
String LANGUAGE_ID = "langid";
String DOCID_PARAM = LANGUAGE_ID + ".idField";
String FIELDS_PARAM = LANGUAGE_ID + ".fl"; // Field list to detect from
String LANG_FIELD = LANGUAGE_ID + ".langField"; // Main language detected
String LANGS_FIELD = LANGUAGE_ID + ".langsField"; // All languages detected (multiValued)
String FALLBACK = LANGUAGE_ID + ".fallback"; // Fallback lang code
String FALLBACK_FIELDS = LANGUAGE_ID + ".fallbackFields"; // Comma-sep list of fallback fields
String OVERWRITE = LANGUAGE_ID + ".overwrite"; // Overwrite if existing language value in LANG_FIELD
String THRESHOLD = LANGUAGE_ID + ".threshold"; // Detection threshold
String ENFORCE_SCHEMA = LANGUAGE_ID + ".enforceSchema"; // Enforces that output fields exist in schema
String LANG_WHITELIST = LANGUAGE_ID + ".whitelist"; // Allowed languages
String MAP_ENABLE = LANGUAGE_ID + ".map"; // Turns on or off the field mapping
String MAP_FL = LANGUAGE_ID + ".map.fl"; // Field list for mapping
String MAP_OVERWRITE = LANGUAGE_ID + ".map.overwrite"; // Whether to overwrite existing fields
String MAP_KEEP_ORIG = LANGUAGE_ID + ".map.keepOrig"; // Keep original field after mapping
String MAP_INDIVIDUAL = LANGUAGE_ID + ".map.individual"; // Detect language per individual field
String MAP_INDIVIDUAL_FL = LANGUAGE_ID + ".map.individual.fl";// Field list of fields to redetect language for
String MAP_LCMAP = LANGUAGE_ID + ".map.lcmap"; // Enables mapping multiple langs to same output field
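// MAP_LCMAP format: space or comma separated "from:to" mappings, e.g. "jp:s zh:cjk ko:cjk"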
String MAP_PATTERN = LANGUAGE_ID + ".map.pattern"; // RegEx pattern to match field name
String MAP_REPLACE = LANGUAGE_ID + ".map.replace"; // Replace pattern
String DOCID_FIELD_DEFAULT = "id";
String DOCID_LANGFIELD_DEFAULT = null;
String DOCID_LANGSFIELD_DEFAULT = null;
String MAP_PATTERN_DEFAULT = "(.*)";
String MAP_REPLACE_DEFAULT = "$1_{lang}";
// TODO: This default threshold accepts even "uncertain" detections.
// Increase &langid.threshold above 0.5 to return only certain detections
Double DOCID_THRESHOLD_DEFAULT = 0.5;
}

View File

@@ -0,0 +1,396 @@
package org.apache.solr.update.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.tika.language.LanguageIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;
/**
* Identifies the language of a set of input fields using Tika's
* LanguageIdentifier. Also supports mapping of field names based
* on detected language.
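* For example, when mapping is enabled with the default pattern, a field "title"
* whose content is detected as English ("en") is mapped to "title_en".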
* The tika-core-x.y.jar must be on the classpath
* <p>
* See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
* @since 3.5
*/
public class LanguageIdentifierUpdateProcessor extends UpdateRequestProcessor implements LangIdParams {
protected final static Logger log = LoggerFactory
.getLogger(LanguageIdentifierUpdateProcessor.class);
protected boolean enabled;
protected String[] inputFields = {};
protected String[] mapFields = {};
protected Pattern mapPattern;
protected String mapReplaceStr;
protected String langField;
protected String langsField; // MultiValued, contains all languages detected
protected String docIdField;
protected String fallbackValue;
protected String[] fallbackFields = {};
protected boolean enableMapping;
protected boolean mapKeepOrig;
protected boolean overwrite;
protected boolean mapOverwrite;
protected boolean mapIndividual;
protected boolean enforceSchema;
protected double threshold;
protected HashSet<String> langWhitelist;
protected HashSet<String> mapIndividualFieldsSet;
protected HashSet<String> allMapFieldsSet;
protected HashMap<String,String> lcMap;
protected IndexSchema schema;
// Regex patterns
protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)");
protected final Pattern langPattern = Pattern.compile("\\{lang\\}");
public LanguageIdentifierUpdateProcessor(SolrQueryRequest req,
SolrQueryResponse rsp, UpdateRequestProcessor next) {
super(next);
schema = req.getSchema();
initParams(req.getParams());
}
private void initParams(SolrParams params) {
if (params != null) {
// Document-centric langId params
setEnabled(params.getBool(LANGUAGE_ID, true));
if(params.get(FIELDS_PARAM, "").length() > 0) {
inputFields = params.get(FIELDS_PARAM, "").split(",");
}
langField = params.get(LANG_FIELD, DOCID_LANGFIELD_DEFAULT);
langsField = params.get(LANGS_FIELD, DOCID_LANGSFIELD_DEFAULT);
docIdField = params.get(DOCID_PARAM, DOCID_FIELD_DEFAULT);
fallbackValue = params.get(FALLBACK);
if(params.get(FALLBACK_FIELDS, "").length() > 0) {
fallbackFields = params.get(FALLBACK_FIELDS).split(",");
}
overwrite = params.getBool(OVERWRITE, false);
langWhitelist = new HashSet<String>();
threshold = params.getDouble(THRESHOLD, DOCID_THRESHOLD_DEFAULT);
if(params.get(LANG_WHITELIST, "").length() > 0) {
for(String lang : params.get(LANG_WHITELIST, "").split(",")) {
langWhitelist.add(lang);
}
}
// Mapping params (field centric)
enableMapping = params.getBool(MAP_ENABLE, false);
if(params.get(MAP_FL, "").length() > 0) {
mapFields = params.get(MAP_FL, "").split(",");
} else {
mapFields = inputFields;
}
mapKeepOrig = params.getBool(MAP_KEEP_ORIG, false);
mapOverwrite = params.getBool(MAP_OVERWRITE, false);
mapIndividual = params.getBool(MAP_INDIVIDUAL, false);
// Process individual fields
String[] mapIndividualFields = {};
if(params.get(MAP_INDIVIDUAL_FL, "").length() > 0) {
mapIndividualFields = params.get(MAP_INDIVIDUAL_FL, "").split(",");
} else {
mapIndividualFields = mapFields;
}
mapIndividualFieldsSet = new HashSet<String>(Arrays.asList(mapIndividualFields));
// Compile a union of the lists of fields to map
allMapFieldsSet = new HashSet<String>(Arrays.asList(mapFields));
if(!Arrays.equals(mapFields, mapIndividualFields)) {
allMapFieldsSet.addAll(mapIndividualFieldsSet);
}
// Language Code mapping
lcMap = new HashMap<String,String>();
if(params.get(MAP_LCMAP) != null) {
for(String mapping : params.get(MAP_LCMAP).split("[, ]")) {
String[] keyVal = mapping.split(":");
if(keyVal.length == 2) {
lcMap.put(keyVal[0], keyVal[1]);
} else {
log.error("Unsupported format for langid.map.lcmap: "+mapping+". Skipping this mapping.");
}
}
}
enforceSchema = params.getBool(ENFORCE_SCHEMA, true);
mapPattern = Pattern.compile(params.get(MAP_PATTERN, MAP_PATTERN_DEFAULT));
mapReplaceStr = params.get(MAP_REPLACE, MAP_REPLACE_DEFAULT);
}
log.debug("LangId configured");
if (inputFields.length == 0) {
throw new SolrException(ErrorCode.BAD_REQUEST,
"Missing or faulty configuration of LanguageIdentifierUpdateProcessor. Input fields must be specified as a comma separated list");
}
}
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
if (isEnabled()) {
process(cmd.getSolrInputDocument());
} else {
log.debug("Processor not enabled, not running");
}
super.processAdd(cmd);
}
/**
* This is the main, testable process method called from processAdd()
* @param doc the SolrInputDocument to work on
* @return the modified SolrInputDocument
*/
protected SolrInputDocument process(SolrInputDocument doc) {
String docLang = null;
HashSet<String> docLangs = new HashSet<String>();
String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
if(langField == null || !doc.containsKey(langField) || (doc.containsKey(langField) && overwrite)) {
String allText = concatFields(doc, inputFields);
List<DetectedLanguage> languagelist = detectLanguage(allText);
docLang = resolveLanguage(languagelist, fallbackLang);
docLangs.add(docLang);
log.debug("Detected main document language from fields "+inputFields+": "+docLang);
if(doc.containsKey(langField) && overwrite) {
log.debug("Overwritten old value "+doc.getFieldValue(langField));
}
if(langField != null && langField.length() != 0) {
doc.setField(langField, docLang);
}
} else {
// langField is set, we sanity check it against whitelist and fallback
docLang = resolveLanguage((String) doc.getFieldValue(langField), fallbackLang);
docLangs.add(docLang);
log.debug("Field "+langField+" already contained value "+docLang+", not overwriting.");
}
if(enableMapping) {
for (String fieldName : allMapFieldsSet) {
if(doc.containsKey(fieldName)) {
String fieldLang;
if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
String text = (String) doc.getFieldValue(fieldName);
List<DetectedLanguage> languagelist = detectLanguage(text);
fieldLang = resolveLanguage(languagelist, docLang);
docLangs.add(fieldLang);
log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
} else {
fieldLang = docLang;
log.debug("Mapping field "+fieldName+" using document global language "+fieldLang);
}
String mappedOutputField = getMappedField(fieldName, fieldLang);
if(enforceSchema && schema.getFieldOrNull(fieldName) == null) {
log.warn("Unsuccessful field name mapping to {}, field does not exist, skipping mapping.", mappedOutputField, fieldName);
mappedOutputField = fieldName;
}
if (mappedOutputField != null) {
log.debug("Mapping field {} to {}", doc.getFieldValue(docIdField), fieldLang);
SolrInputField inField = doc.getField(fieldName);
doc.setField(mappedOutputField, inField.getValue(), inField.getBoost());
if(!mapKeepOrig) {
log.debug("Removing old field {}", fieldName);
doc.removeField(fieldName);
}
} else {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Invalid output field mapping for "
+ fieldName + " field and language: " + fieldLang);
}
} else {
log.warn("Document {} does not contain input field {}. Skipping this field.", doc.getFieldValue(docIdField), fieldName);
}
}
}
// Set the languages field to an array of all detected languages
if(langsField != null && langsField.length() != 0) {
doc.setField(langsField, docLangs.toArray());
}
return doc;
}
/**
* Decides the fallback language, either from content of fallback field or fallback value
* @param doc the Solr document
* @param fallbackFields an array of strings with field names containing fallback language codes
* @param fallbackValue a language code to use in case no fallbackFields are found
*/
private String getFallbackLang(SolrInputDocument doc, String[] fallbackFields, String fallbackValue) {
String lang = null;
for(String field : fallbackFields) {
if(doc.containsKey(field)) {
lang = (String) doc.getFieldValue(field);
log.debug("Language fallback to field "+field);
break;
}
}
if(lang == null) {
log.debug("Language fallback to value "+fallbackValue);
lang = fallbackValue;
}
return lang;
}
/*
* Concatenates content from multiple fields
*/
protected String concatFields(SolrInputDocument doc, String[] fields) {
StringBuilder sb = new StringBuilder();
for (String fieldName : fields) {
log.debug("Appending field "+fieldName);
if (doc.containsKey(fieldName)) {
Object content = doc.getFieldValue(fieldName);
if(content instanceof String) {
sb.append((String) doc.getFieldValue(fieldName));
sb.append(" ");
} else {
log.warn("Field "+fieldName+" not a String value, not including in detection");
}
}
}
return sb.toString();
}
/**
* Detects language(s) from a string.
* Classes wishing to implement their own language detection module should override this method.
* @param content The content to identify
* @return List of detected language(s) according to RFC-3066
*/
protected List<DetectedLanguage> detectLanguage(String content) {
List<DetectedLanguage> languages = new ArrayList<DetectedLanguage>();
if(content.trim().length() != 0) {
LanguageIdentifier identifier = new LanguageIdentifier(content);
// FIXME: Hack - we get the distance from toString() and calculate our own certainty score
Double distance = Double.parseDouble(tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1"));
// This formula gives: distance 0.02 => certainty 0.9, 0.1 => 0.5, which is a better sweet spot than isReasonablyCertain()
Double certainty = 1 - (5 * distance);
certainty = (certainty < 0) ? 0 : certainty;
DetectedLanguage language = new DetectedLanguage(identifier.getLanguage(), certainty);
languages.add(language);
log.debug("Language detected as "+language+" with a certainty of "+language.getCertainty()+" (Tika distance="+identifier.toString()+")");
} else {
log.debug("No input text to detect language from, returning empty list");
}
return languages;
}
/**
* Chooses a language based on the list of candidates detected
* @param language language code as a string
* @param fallbackLang the language code to use as a fallback
* @return a string of the chosen language
*/
protected String resolveLanguage(String language, String fallbackLang) {
List<DetectedLanguage> l = new ArrayList<DetectedLanguage>();
l.add(new DetectedLanguage(language, 1.0));
return resolveLanguage(l, fallbackLang);
}
/**
* Chooses a language based on the list of candidates detected
* @param languages a List of DetectedLanguages with certainty score
* @param fallbackLang the language code to use as a fallback
* @return a string of the chosen language
*/
protected String resolveLanguage(List<DetectedLanguage> languages, String fallbackLang) {
String langStr;
if(languages.size() == 0) {
log.debug("No language detected, using fallback {}", fallbackLang);
langStr = fallbackLang;
} else {
DetectedLanguage lang = languages.get(0);
if(langWhitelist.isEmpty() || langWhitelist.contains(lang.getLangCode())) {
log.debug("Language detected {} with certainty {}", lang.getLangCode(), lang.getCertainty());
if(lang.getCertainty() >= threshold) {
langStr = lang.getLangCode();
} else {
log.debug("Detected language below threshold {}, using fallback {}", threshold, fallbackLang);
langStr = fallbackLang;
}
} else {
log.debug("Detected a language not in whitelist ({}), using fallback {}", lang.getLangCode(), fallbackLang);
langStr = fallbackLang;
}
}
if(langStr == null || langStr.length() == 0) {
log.warn("Language resolved to null or empty string. Fallback not configured?");
langStr = "";
}
return langStr;
}
/**
* Returns the name of the field to map the current contents into, so that they are properly analyzed. For instance
* if the currentField is "text" and the code is "en", the new field would be "text_en". If such a field doesn't exist,
* then null is returned.
*
* @param currentField The current field name
* @param language the language code
* @return The new schema field name, based on pattern and replace
*/
protected String getMappedField(String currentField, String language) {
String lc = lcMap.containsKey(language) ? lcMap.get(language) : language;
String newFieldName = langPattern.matcher(mapPattern.matcher(currentField).replaceFirst(mapReplaceStr)).replaceFirst(lc);
log.debug("Doing mapping from "+currentField+" with language "+language+" to field "+newFieldName);
return newFieldName;
}
/**
* Tells if this processor is enabled or not
* @return true if enabled, else false
*/
public boolean isEnabled() {
return enabled;
}
public void setEnabled(boolean enabled) {
this.enabled = enabled;
}
}

View File

@@ -0,0 +1,94 @@
package org.apache.solr.update.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;
/**
* Identifies the language of a set of input fields using Tika's
* LanguageIdentifier. The tika-core-x.y.jar must be on the classpath
* <p/>
* The UpdateProcessorChain config entry can take a number of parameters
* which may also be passed as HTTP parameters on the update request
* and override the defaults. Here is the simplest processor config possible:
*
* <pre class="prettyprint" >
* &lt;processor class=&quot;org.apache.solr.update.processor.LanguageIdentifierUpdateProcessorFactory&quot;&gt;
* &lt;str name=&quot;langid.fl&quot;&gt;title,text&lt;/str&gt;
* &lt;str name=&quot;langid.langField&quot;&gt;language_s&lt;/str&gt;
* &lt;/processor&gt;
* </pre>
* See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
* @since 3.5
*/
public class LanguageIdentifierUpdateProcessorFactory extends
UpdateRequestProcessorFactory implements SolrCoreAware, LangIdParams {
protected SolrParams defaults;
protected SolrParams appends;
protected SolrParams invariants;
@Override
public void inform(SolrCore core) {
}
/**
* The UpdateRequestProcessor may be initialized in solrconfig.xml similarly
* to a RequestHandler, with defaults, appends and invariants.
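* For example (a sketch; parameter values are illustrative only):
* <pre class="prettyprint" >
* &lt;processor class=&quot;org.apache.solr.update.processor.LanguageIdentifierUpdateProcessorFactory&quot;&gt;
*   &lt;lst name=&quot;defaults&quot;&gt;
*     &lt;str name=&quot;langid.fl&quot;&gt;title,text&lt;/str&gt;
*     &lt;str name=&quot;langid.langField&quot;&gt;language_s&lt;/str&gt;
*   &lt;/lst&gt;
*   &lt;lst name=&quot;invariants&quot;&gt;
*     &lt;bool name=&quot;langid.map&quot;&gt;false&lt;/bool&gt;
*   &lt;/lst&gt;
* &lt;/processor&gt;
* </pre>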
* @param args a NamedList with the configuration parameters
*/
@SuppressWarnings("rawtypes")
public void init( NamedList args )
{
if (args != null) {
Object o;
o = args.get("defaults");
if (o != null && o instanceof NamedList) {
defaults = SolrParams.toSolrParams((NamedList) o);
} else {
defaults = SolrParams.toSolrParams(args);
}
o = args.get("appends");
if (o != null && o instanceof NamedList) {
appends = SolrParams.toSolrParams((NamedList) o);
}
o = args.get("invariants");
if (o != null && o instanceof NamedList) {
invariants = SolrParams.toSolrParams((NamedList) o);
}
}
}
@Override
public UpdateRequestProcessor getInstance(SolrQueryRequest req,
SolrQueryResponse rsp, UpdateRequestProcessor next) {
// Process defaults, appends and invariants if we got a request
if(req != null) {
SolrPluginUtils.setDefaults(req, defaults, appends, invariants);
}
return new LanguageIdentifierUpdateProcessor(req, rsp, next);
}
}

View File

@@ -0,0 +1,81 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- The Solr schema file. This file should be named "schema.xml" and
should be located where the classloader for the Solr webapp can find it.
This schema is used for testing, and as such has everything and the
kitchen sink thrown in. See example/solr/conf/schema.xml for a
more concise example.
$Id: schema.xml 382610 2006-03-03 01:43:03Z yonik $
$Source: /cvs/main/searching/solr-configs/test/WEB-INF/classes/schema.xml,v $
$Name: $
-->
<schema name="test" version="1.4">
<types>
<fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>
<!-- solr.TextField allows the specification of custom
text analyzers specified as a tokenizer and a list
of token filters.
-->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="nametext" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
</fieldtype>
</types>
<fields>
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="name" type="nametext" indexed="true" stored="true"/>
<field name="subject" type="text" indexed="true" stored="true"/>
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have
a "*" only at the start or the end.
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
Longer patterns will be matched first. if equal size patterns
both match, the first appearing in the schema will be used.
-->
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_sm" type="string" indexed="true" stored="true" multiValued="true" />
<!-- Fields for language identification -->
<dynamicField name="name_*" type="string" indexed="true" stored="false"/>
<dynamicField name="subject_*" type="string" indexed="true" stored="false"/>
</fields>
<defaultSearchField>name</defaultSearchField>
<uniqueKey>id</uniqueKey>
</schema>

View File

@@ -0,0 +1,81 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<config>
<jmx />
<!-- Used to specify an alternate directory to hold all index data.
It defaults to "index" if not present, and should probably
not be changed if replication is in use. -->
<dataDir>${solr.data.dir:./solr/data}</dataDir>
<!-- The DirectoryFactory to use for indexes.
solr.StandardDirectoryFactory, the default, is filesystem based.
solr.RAMDirectoryFactory is memory based and not persistent. -->
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
<luceneMatchVersion>LUCENE_40</luceneMatchVersion>
<requestHandler name="search" class="solr.SearchHandler" default="true">
<lst name="defaults">
<str name="echoParams">explicit</str>
<int name="rows">10</int>
</lst>
</requestHandler>
<requestHandler name="standard" class="solr.StandardRequestHandler">
<bool name="httpCaching">true</bool>
</requestHandler>
<requestHandler name="/admin/" class="org.apache.solr.handler.admin.AdminHandlers" />
<!-- enable streaming for testing... -->
<requestDispatcher handleSelect="true" >
<requestParsers enableRemoteStreaming="true" multipartUploadLimitInKB="2048" />
<httpCaching lastModifiedFrom="openTime" etagSeed="Solr" never304="false">
<cacheControl>max-age=30, public</cacheControl>
</httpCaching>
</requestDispatcher>
<requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
<lst name="defaults">
<str name="update.chain">lang_id</str>
</lst>
</requestHandler>
<updateRequestProcessorChain name="lang_id">
<processor class="org.apache.solr.update.processor.LanguageIdentifierUpdateProcessorFactory">
<!-- Can take defaults, invariants and appends just like req handlers-->
<lst name="defaults">
<bool name="langid">true</bool>
<str name="langid.fl">name,subject</str>
<bool name="langid.map">true</bool>
<str name="langid.langField">language_s</str>
<str name="langid.langsField">language_sm</str>
<str name="langid.map.lcmap">th:thai</str>
<float name="threshold">0.5</float>
<str name="langid.fallback">fallback</str>
</lst>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
</config>

View File

@@ -0,0 +1,216 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.util.ArrayList;
import java.util.List;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.core.SolrCore;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.servlet.SolrRequestParsers;
public class LanguageIdentifierUpdateProcessorFactoryTest extends SolrTestCaseJ4 {
protected static SolrRequestParsers _parser;
protected static SolrQueryRequest req;
protected static SolrQueryResponse resp = new SolrQueryResponse();
protected static LanguageIdentifierUpdateProcessor liProcessor;
protected static ModifiableSolrParams parameters;
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-languageidentifier.xml", "schema.xml", getFile("langid/solr").getAbsolutePath());
SolrCore core = h.getCore();
UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("lang_id");
assertNotNull(chained);
_parser = new SolrRequestParsers(null);
}
@Override
@Before
public void setUp() throws Exception {
super.setUp();
clearIndex();
assertU(commit());
}
@Test
public void testLangIdGlobal() throws Exception {
parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "name,subject");
parameters.add("langid.langField", "language_s");
parameters.add("langid.fallback", "un");
liProcessor = createLangIdProcessor(parameters);
assertLang("no", "id", "1no", "name", "Lucene", "subject", "Lucene er et fri/åpen kildekode programvarebibliotek for informasjonsgjenfinning, opprinnelig utviklet i programmeringsspråket Java av Doug Cutting. Lucene støttes av Apache Software Foundation og utgis under Apache-lisensen.");
assertLang("en", "id", "2en", "name", "Lucene", "subject", "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.");
assertLang("sv", "id", "3sv", "name", "Maven", "subject", "Apache Maven är ett verktyg utvecklat av Apache Software Foundation och används inom systemutveckling av datorprogram i programspråket Java. Maven används för att automatiskt paketera (bygga) programfilerna till en distribuerbar enhet. Maven används inom samma område som Apache Ant men dess byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade.");
assertLang("es", "id", "4es", "name", "Lucene", "subject", "Lucene es un API de código abierto para recuperación de información, originalmente implementada en Java por Doug Cutting. Está apoyado por el Apache Software Foundation y se distribuye bajo la Apache Software License. Lucene tiene versiones para otros lenguajes incluyendo Delphi, Perl, C#, C++, Python, Ruby y PHP.");
assertLang("un", "id", "5un", "name", "a", "subject", "b");
assertLang("th", "id", "6th", "name", "บทความคัดสรรเดือนนี้", "subject", "อันเนอลีส มารี อันเนอ ฟรังค์ หรือมักรู้จักในภาษาไทยว่า แอนน์ แฟรงค์ เป็นเด็กหญิงชาวยิว เกิดที่เมืองแฟรงก์เฟิร์ต ประเทศเยอรมนี เธอมีชื่อเสียงโด่งดังในฐานะผู้เขียนบันทึกประจำวันซึ่งต่อมาได้รับการตีพิมพ์เป็นหนังสือ บรรยายเหตุการณ์ขณะหลบซ่อนตัวจากการล่าชาวยิวในประเทศเนเธอร์แลนด์ ระหว่างที่ถูกเยอรมนีเข้าครอบครองในช่วงสงครามโลกครั้งที่สอง");
assertLang("ru", "id", "7ru", "name", "Lucene", "subject", "The Apache Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска, написанная на Java. Может быть использована для поиска в интернете и других областях компьютерной лингвистики (аналитическая философия).");
assertLang("de", "id", "8de", "name", "Lucene", "subject", "Lucene ist ein Freie-Software-Projekt der Apache Software Foundation, das eine Suchsoftware erstellt. Durch die hohe Leistungsfähigkeit und Skalierbarkeit können die Lucene-Werkzeuge für beliebige Projektgrößen und Anforderungen eingesetzt werden. So setzt beispielsweise Wikipedia Lucene für die Volltextsuche ein. Zudem verwenden die beiden Desktop-Suchprogramme Beagle und Strigi eine C#- bzw. C++- Portierung von Lucene als Indexer.");
assertLang("fr", "id", "9fr", "name", "Lucene", "subject", "Lucene est un moteur de recherche libre écrit en Java qui permet d'indexer et de rechercher du texte. C'est un projet open source de la fondation Apache mis à disposition sous licence Apache. Il est également disponible pour les langages Ruby, Perl, C++, PHP.");
assertLang("nl", "id", "10nl", "name", "Lucene", "subject", "Lucene is een gratis open source, tekst gebaseerde information retrieval API van origine geschreven in Java door Doug Cutting. Het wordt ondersteund door de Apache Software Foundation en is vrijgegeven onder de Apache Software Licentie. Lucene is ook beschikbaar in andere programeertalen zoals Perl, C#, C++, Python, Ruby en PHP.");
assertLang("it", "id", "11it", "name", "Lucene", "subject", "Lucene è una API gratuita ed open source per il reperimento di informazioni inizialmente implementata in Java da Doug Cutting. È supportata dall'Apache Software Foundation ed è resa disponibile con l'Apache License. Lucene è stata successivamente reimplementata in Perl, C#, C++, Python, Ruby e PHP.");
assertLang("pt", "id", "12pt", "name", "Lucene", "subject", "Apache Lucene, ou simplesmente Lucene, é um software de busca e uma API de indexação de documentos, escrito na linguagem de programação Java. É um software de código aberto da Apache Software Foundation licenciado através da licença Apache.");
}
@Test
public void testMapFieldName() throws Exception {
parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "name");
parameters.add("langid.map.lcmap", "jp:s zh:cjk ko:cjk");
parameters.add("langid.enforceSchema", "true");
liProcessor = createLangIdProcessor(parameters);
assertEquals("test_no", liProcessor.getMappedField("test", "no"));
assertEquals("test_en", liProcessor.getMappedField("test", "en"));
assertEquals("test_s", liProcessor.getMappedField("test", "jp"));
assertEquals("test_cjk", liProcessor.getMappedField("test", "zh"));
assertEquals("test_cjk", liProcessor.getMappedField("test", "ko"));
// Prove support for other mapping regex
parameters.add("langid.map.pattern", "text_(.*?)_field");
parameters.add("langid.map.replace", "$1_{lang}Text");
liProcessor = createLangIdProcessor(parameters);
assertEquals("title_noText", liProcessor.getMappedField("text_title_field", "no"));
assertEquals("body_svText", liProcessor.getMappedField("text_body_field", "sv"));
}
@Test
public void testPreExisting() throws Exception {
SolrInputDocument doc;
parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "text");
parameters.add("langid.langField", "language");
parameters.add("langid.langsField", "languages");
parameters.add("langid.enforceSchema", "false");
parameters.add("langid.map", "true");
liProcessor = createLangIdProcessor(parameters);
doc = englishDoc();
assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
doc = englishDoc();
doc.setField("language", "no");
assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
assertNotNull(liProcessor.process(doc).getFieldValue("text_no"));
}
@Test
public void testDefaultFallbackEmptyString() throws Exception {
SolrInputDocument doc;
parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "text");
parameters.add("langid.langField", "language");
parameters.add("langid.enforceSchema", "false");
liProcessor = createLangIdProcessor(parameters);
doc = tooShortDoc();
assertEquals("", liProcessor.process(doc).getFieldValue("language"));
}
@Test
public void testFallback() throws Exception {
SolrInputDocument doc;
parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "text");
parameters.add("langid.langField", "language");
parameters.add("langid.fallbackFields", "noop,fb");
parameters.add("langid.fallback", "fbVal");
parameters.add("langid.enforceSchema", "false");
liProcessor = createLangIdProcessor(parameters);
// Verify fallback to field fb (noop field does not exist and is skipped)
doc = tooShortDoc();
doc.addField("fb", "fbField");
assertEquals("fbField", liProcessor.process(doc).getFieldValue("language"));
// Verify fallback to fallback value since no fallback fields exist
doc = tooShortDoc();
assertEquals("fbVal", liProcessor.process(doc).getFieldValue("language"));
}
@Test
public void testResolveLanguage() throws Exception {
List<DetectedLanguage> langs;
parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "text");
parameters.add("langid.langField", "language");
liProcessor = createLangIdProcessor(parameters);
// No detected languages
langs = new ArrayList<DetectedLanguage>();
assertEquals("", liProcessor.resolveLanguage(langs, null));
assertEquals("fallback", liProcessor.resolveLanguage(langs, "fallback"));
// One detected language
langs.add(new DetectedLanguage("one", 1.0));
assertEquals("one", liProcessor.resolveLanguage(langs, "fallback"));
// One detected language under default threshold
langs = new ArrayList<DetectedLanguage>();
langs.add(new DetectedLanguage("under", 0.1));
assertEquals("fallback", liProcessor.resolveLanguage(langs, "fallback"));
}
// Various utility methods
private SolrInputDocument englishDoc() {
SolrInputDocument doc = new SolrInputDocument();
doc.addField("text", "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.");
return doc;
}
private SolrInputDocument tooShortDoc() {
SolrInputDocument doc = new SolrInputDocument();
doc.addField("text", "This text is too short");
return doc;
}
private LanguageIdentifierUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) throws Exception {
return new LanguageIdentifierUpdateProcessor(_parser.buildRequestFrom(null, parameters, null), resp, null);
}
private void assertLang(String langCode, String... fieldsAndValues) throws Exception {
if(liProcessor == null)
throw new Exception("Processor must be initialized before calling assertLang()");
SolrInputDocument doc = sid(fieldsAndValues);
assertEquals(langCode, liProcessor.process(doc).getFieldValue(liProcessor.langField));
}
private SolrInputDocument sid(String... fieldsAndValues) {
SolrInputDocument doc = new SolrInputDocument();
for (int i = 0; i < fieldsAndValues.length; i+=2) {
doc.addField(fieldsAndValues[i], fieldsAndValues[i+1]);
}
return doc;
}
}

View File

@@ -62,6 +62,7 @@
-->
<lib dir="../../dist/" regex="apache-solr-cell-\d.*\.jar" />
<lib dir="../../dist/" regex="apache-solr-clustering-\d.*\.jar" />
<lib dir="../../dist/" regex="apache-solr-langid-\d.*\.jar" />
<!-- If a dir option (with or without a regex) is used and nothing
is found that matches, it will be ignored
-->
@@ -1527,7 +1528,29 @@
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
-->
<!-- Language identification
This example update chain identifies the language of the incoming
documents using the langid contrib. The detected language is
written to field language_s. No field name mapping is done.
The fields used for detection are text, title, subject and description,
making this example suitable for detecting languages from full-text
rich documents injected via ExtractingRequestHandler.
See more about langId at http://wiki.apache.org/solr/LanguageDetection
-->
<!--
<updateRequestProcessorChain name="langid">
<processor class="org.apache.solr.update.processor.LanguageIdentifierUpdateProcessorFactory">
<str name="langid.fl">text,title,subject,description</str>
<str name="langid.langField">language_s</str>
<str name="langid.fallback">en</str>
</processor>
<processor class="solr.LogUpdateProcessorFactory" />
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
-->
<!-- Response Writers
http://wiki.apache.org/solr/QueryResponseWriter