From b7aee61754194557e66526121c3c66b1910a55e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Wed, 5 Oct 2011 20:21:59 +0000 Subject: [PATCH] SOLR-1979: Create LanguageIdentifierUpdateProcessor git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1179416 13f79535-47bb-0310-9956-ffa450edef68 --- dev-tools/eclipse/dot.classpath | 3 + solr/CHANGES.txt | 3 + solr/build.xml | 2 + solr/contrib/langid/CHANGES.txt | 15 + solr/contrib/langid/README.txt | 23 + solr/contrib/langid/build.xml | 33 ++ .../update/processor/DetectedLanguage.java | 47 +++ .../solr/update/processor/LangIdParams.java | 53 +++ .../LanguageIdentifierUpdateProcessor.java | 396 ++++++++++++++++++ ...guageIdentifierUpdateProcessorFactory.java | 94 +++++ .../test-files/langid/solr/conf/schema.xml | 81 ++++ .../conf/solrconfig-languageidentifier.xml | 81 ++++ ...eIdentifierUpdateProcessorFactoryTest.java | 216 ++++++++++ solr/example/solr/conf/solrconfig.xml | 23 + 14 files changed, 1070 insertions(+) create mode 100644 solr/contrib/langid/CHANGES.txt create mode 100644 solr/contrib/langid/README.txt create mode 100644 solr/contrib/langid/build.xml create mode 100644 solr/contrib/langid/src/java/org/apache/solr/update/processor/DetectedLanguage.java create mode 100644 solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java create mode 100644 solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java create mode 100644 solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactory.java create mode 100644 solr/contrib/langid/src/test-files/langid/solr/conf/schema.xml create mode 100644 solr/contrib/langid/src/test-files/langid/solr/conf/solrconfig-languageidentifier.xml create mode 100644 solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTest.java diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath 
index 01be1fdf4d0..087e24e9b99 100644 --- a/dev-tools/eclipse/dot.classpath +++ b/dev-tools/eclipse/dot.classpath @@ -71,6 +71,9 @@ + + + diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 1f12c38c16b..d1e8bb39b3a 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -354,6 +354,9 @@ New Features * SOLR-2769: Added factory for the new Hunspell stemmer capable of doing stemming for 99 languages (janhoy, cmale) +* SOLR-1979: New contrib "langid". Adds language identification capabilities as an + Update Processor, using Tika's LanguageIdentifier (janhoy, Tommaso Teofili, gsingers) + Bug Fixes ---------------------- * SOLR-2748: The CommitTracker used for commitWith or autoCommit by maxTime diff --git a/solr/build.xml b/solr/build.xml index 67fafaac8ec..20337723f15 100644 --- a/solr/build.xml +++ b/solr/build.xml @@ -456,12 +456,14 @@ + + diff --git a/solr/contrib/langid/CHANGES.txt b/solr/contrib/langid/CHANGES.txt new file mode 100644 index 00000000000..18ab4049be8 --- /dev/null +++ b/solr/contrib/langid/CHANGES.txt @@ -0,0 +1,15 @@ +Apache Solr Language Identifier + Release Notes + +This file describes changes to the SolrTika Language Identifier (contrib/langid) module. +See http://wiki.apache.org/solr/LanguageDetection for details + + +$Id$ + +================== Release 3.5.0 ================== + +Initial release. See README.txt. + +* SOLR-1979: New contrib "langid". Adds language identification capabilities as an + Update Processor, using Tika's LanguageIdentifier (janhoy, Tommaso Teofili, gsingers) diff --git a/solr/contrib/langid/README.txt b/solr/contrib/langid/README.txt new file mode 100644 index 00000000000..3d92fc3d910 --- /dev/null +++ b/solr/contrib/langid/README.txt @@ -0,0 +1,23 @@ +Apache Solr Language Identifier + + +Introduction +------------ +This module is intended to be used while indexing documents. +It is implemented as an UpdateProcessor to be placed in an UpdateChain. 
+Its purpose is to identify language from documents and tag the document with language code. +The module can optionally map field names to their language specific counterpart, +e.g. if the input is "title" and language is detected as "en", map to "title_en". +Language may be detected globally for the document, and/or individually per field. + +The module currently relies on Tika's language identification capabilities. + +Getting Started +--------------- +Please refer to the module documentation at http://wiki.apache.org/solr/LanguageDetection + +Dependencies +------------ +This contrib depends on Tika Core, which is part of extraction contrib. +The easiest is thus to first install extraction contrib and then langid. +Alternatively you can include tika-core manually on your classpath. \ No newline at end of file diff --git a/solr/contrib/langid/build.xml b/solr/contrib/langid/build.xml new file mode 100644 index 00000000000..158b3e4cff7 --- /dev/null +++ b/solr/contrib/langid/build.xml @@ -0,0 +1,33 @@ + + + + + + + + Language Identifier contrib for extracting language from a document being indexed + + + + + + + + + + diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/DetectedLanguage.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/DetectedLanguage.java new file mode 100644 index 00000000000..0c58a9673cd --- /dev/null +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/DetectedLanguage.java @@ -0,0 +1,47 @@ +package org.apache.solr.update.processor; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * Immutable bean holding a language code and the certainty of its detection.
 */
public class DetectedLanguage {
  private final String langCode;
  private final Double certainty;

  /**
   * Creates a detection result.
   *
   * @param lang the detected language code
   * @param certainty the detection certainty
   */
  DetectedLanguage(String lang, Double certainty) {
    this.langCode = lang;
    this.certainty = certainty;
  }

  /**
   * Returns the detected language code
   * @return language code as a string
   */
  public String getLangCode() {
    return langCode;
  }

  /**
   * Returns the detected certainty for this language
   * @return certainty as a value between 0.0 and 1.0 where 1.0 is 100% certain
   */
  public Double getCertainty() {
    return certainty;
  }

  /**
   * Renders the language code and certainty in readable form, so that
   * concatenating an instance into a log message does not print the
   * default identity hash.
   *
   * @return a string such as {@code en (certainty=0.8)}
   */
  @Override
  public String toString() {
    return langCode + " (certainty=" + certainty + ")";
  }
}
/**
 * Parameter names and default values understood by the language
 * identification update processor. All parameter names share the
 * {@code langid} prefix.
 */
public interface LangIdParams {

  /** Common prefix of every parameter. */
  String LANGUAGE_ID = "langid";

  // ---- Document centric parameters ----

  /** Name of the field holding the document id. */
  String DOCID_PARAM = LANGUAGE_ID + ".idField";
  /** Field list to detect from. */
  String FIELDS_PARAM = LANGUAGE_ID + ".fl";
  /** Main language detected. */
  String LANG_FIELD = LANGUAGE_ID + ".langField";
  /** All languages detected (multiValued). */
  String LANGS_FIELD = LANGUAGE_ID + ".langsField";
  /** Fallback lang code. */
  String FALLBACK = LANGUAGE_ID + ".fallback";
  /** Comma-sep list of fallback fields. */
  String FALLBACK_FIELDS = LANGUAGE_ID + ".fallbackFields";
  /** Overwrite if existing language value in LANG_FIELD. */
  String OVERWRITE = LANGUAGE_ID + ".overwrite";
  /** Detection threshold. */
  String THRESHOLD = LANGUAGE_ID + ".threshold";
  /** Enforces that output fields exist in schema. */
  String ENFORCE_SCHEMA = LANGUAGE_ID + ".enforceSchema";
  /** Allowed languages. */
  String LANG_WHITELIST = LANGUAGE_ID + ".whitelist";

  // ---- Field mapping parameters ----

  /** Turns on or off the field mapping. */
  String MAP_ENABLE = LANGUAGE_ID + ".map";
  /** Field list for mapping. */
  String MAP_FL = LANGUAGE_ID + ".map.fl";
  /** Whether to overwrite existing fields. */
  String MAP_OVERWRITE = LANGUAGE_ID + ".map.overwrite";
  /** Keep original field after mapping. */
  String MAP_KEEP_ORIG = LANGUAGE_ID + ".map.keepOrig";
  /** Detect language per individual field. */
  String MAP_INDIVIDUAL = LANGUAGE_ID + ".map.individual";
  /** Field list of fields to redetect language for. */
  String MAP_INDIVIDUAL_FL = LANGUAGE_ID + ".map.individual.fl";
  /** Enables mapping multiple langs to same output field. */
  String MAP_LCMAP = LANGUAGE_ID + ".map.lcmap";
  /** RegEx pattern to match field name. */
  String MAP_PATTERN = LANGUAGE_ID + ".map.pattern";
  /** Replace pattern. */
  String MAP_REPLACE = LANGUAGE_ID + ".map.replace";

  // ---- Defaults ----

  String DOCID_FIELD_DEFAULT = "id";
  String DOCID_LANGFIELD_DEFAULT = null;
  String DOCID_LANGSFIELD_DEFAULT = null;
  String MAP_PATTERN_DEFAULT = "(.*)";
  String MAP_REPLACE_DEFAULT = "$1_{lang}";

  /**
   * Default detection threshold.
   * TODO: This default threshold accepts even "uncertain" detections.
   * Increase &amp;langid.threshold above 0.5 to return only certain detections
   */
  Double DOCID_THRESHOLD_DEFAULT = 0.5;
}
+ */ + +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.SolrInputField; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.tika.language.LanguageIdentifier; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.regex.Pattern; + + +/** + * Identifies the language of a set of input fields using Tika's + * LanguageIdentifier. Also supports mapping of field names based + * on detected language. + * The tika-core-x.y.jar must be on the classpath + *

+ * See http://wiki.apache.org/solr/LanguageDetection + * @since 3.5 + */ +public class LanguageIdentifierUpdateProcessor extends UpdateRequestProcessor implements LangIdParams { + + protected final static Logger log = LoggerFactory + .getLogger(LanguageIdentifierUpdateProcessor.class); + + protected boolean enabled; + + protected String[] inputFields = {}; + protected String[] mapFields = {}; + protected Pattern mapPattern; + protected String mapReplaceStr; + protected String langField; + protected String langsField; // MultiValued, contains all languages detected + protected String docIdField; + protected String fallbackValue; + protected String[] fallbackFields = {}; + protected boolean enableMapping; + protected boolean mapKeepOrig; + protected boolean overwrite; + protected boolean mapOverwrite; + protected boolean mapIndividual; + protected boolean enforceSchema; + protected double threshold; + protected HashSet langWhitelist; + protected HashSet mapIndividualFieldsSet; + protected HashSet allMapFieldsSet; + protected HashMap lcMap; + protected IndexSchema schema; + + // Regex patterns + protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)"); + protected final Pattern langPattern = Pattern.compile("\\{lang\\}"); + + public LanguageIdentifierUpdateProcessor(SolrQueryRequest req, + SolrQueryResponse rsp, UpdateRequestProcessor next) { + super(next); + schema = req.getSchema(); + + initParams(req.getParams()); + } + + private void initParams(SolrParams params) { + if (params != null) { + // Document-centric langId params + setEnabled(params.getBool(LANGUAGE_ID, true)); + if(params.get(FIELDS_PARAM, "").length() > 0) { + inputFields = params.get(FIELDS_PARAM, "").split(","); + } + langField = params.get(LANG_FIELD, DOCID_LANGFIELD_DEFAULT); + langsField = params.get(LANGS_FIELD, DOCID_LANGSFIELD_DEFAULT); + docIdField = params.get(DOCID_PARAM, DOCID_FIELD_DEFAULT); + fallbackValue = params.get(FALLBACK); + if(params.get(FALLBACK_FIELDS, 
"").length() > 0) { + fallbackFields = params.get(FALLBACK_FIELDS).split(","); + } + overwrite = params.getBool(OVERWRITE, false); + langWhitelist = new HashSet(); + threshold = params.getDouble(THRESHOLD, DOCID_THRESHOLD_DEFAULT); + if(params.get(LANG_WHITELIST, "").length() > 0) { + for(String lang : params.get(LANG_WHITELIST, "").split(",")) { + langWhitelist.add(lang); + } + } + + // Mapping params (field centric) + enableMapping = params.getBool(MAP_ENABLE, false); + if(params.get(MAP_FL, "").length() > 0) { + mapFields = params.get(MAP_FL, "").split(","); + } else { + mapFields = inputFields; + } + mapKeepOrig = params.getBool(MAP_KEEP_ORIG, false); + mapOverwrite = params.getBool(MAP_OVERWRITE, false); + mapIndividual = params.getBool(MAP_INDIVIDUAL, false); + + // Process individual fields + String[] mapIndividualFields = {}; + if(params.get(MAP_INDIVIDUAL_FL, "").length() > 0) { + mapIndividualFields = params.get(MAP_INDIVIDUAL_FL, "").split(","); + } else { + mapIndividualFields = mapFields; + } + mapIndividualFieldsSet = new HashSet(Arrays.asList(mapIndividualFields)); + // Compile a union of the lists of fields to map + allMapFieldsSet = new HashSet(Arrays.asList(mapFields)); + if(Arrays.equals(mapFields, mapIndividualFields)) { + allMapFieldsSet.addAll(mapIndividualFieldsSet); + } + + // Language Code mapping + lcMap = new HashMap(); + if(params.get(MAP_LCMAP) != null) { + for(String mapping : params.get(MAP_LCMAP).split("[, ]")) { + String[] keyVal = mapping.split(":"); + if(keyVal.length == 2) { + lcMap.put(keyVal[0], keyVal[1]); + } else { + log.error("Unsupported format for langid.map.lcmap: "+mapping+". 
Skipping this mapping."); + } + } + } + enforceSchema = params.getBool(ENFORCE_SCHEMA, true); + + mapPattern = Pattern.compile(params.get(MAP_PATTERN, MAP_PATTERN_DEFAULT)); + mapReplaceStr = params.get(MAP_REPLACE, MAP_REPLACE_DEFAULT); + + + } + log.debug("LangId configured"); + + + if (inputFields.length == 0) { + throw new SolrException(ErrorCode.BAD_REQUEST, + "Missing or faulty configuration of LanguageIdentifierUpdateProcessor. Input fields must be specified as a comma separated list"); + } + + } + + @Override + public void processAdd(AddUpdateCommand cmd) throws IOException { + if (isEnabled()) { + process(cmd.getSolrInputDocument()); + } else { + log.debug("Processor not enabled, not running"); + } + super.processAdd(cmd); + } + + /** + * This is the main, testable process method called from processAdd() + * @param doc the SolrInputDocument to work on + * @return the modified SolrInputDocument + */ + protected SolrInputDocument process(SolrInputDocument doc) { + String docLang = null; + HashSet docLangs = new HashSet(); + String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue); + + if(langField == null || !doc.containsKey(langField) || (doc.containsKey(langField) && overwrite)) { + String allText = concatFields(doc, inputFields); + List languagelist = detectLanguage(allText); + docLang = resolveLanguage(languagelist, fallbackLang); + docLangs.add(docLang); + log.debug("Detected main document language from fields "+inputFields+": "+docLang); + + if(doc.containsKey(langField) && overwrite) { + log.debug("Overwritten old value "+doc.getFieldValue(langField)); + } + if(langField != null && langField.length() != 0) { + doc.setField(langField, docLang); + } + } else { + // langField is set, we sanity check it against whitelist and fallback + docLang = resolveLanguage((String) doc.getFieldValue(langField), fallbackLang); + docLangs.add(docLang); + log.debug("Field "+langField+" already contained value "+docLang+", not overwriting."); + } + + 
if(enableMapping) { + for (String fieldName : allMapFieldsSet) { + if(doc.containsKey(fieldName)) { + String fieldLang; + if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) { + String text = (String) doc.getFieldValue(fieldName); + List languagelist = detectLanguage(text); + fieldLang = resolveLanguage(languagelist, docLang); + docLangs.add(fieldLang); + log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang); + } else { + fieldLang = docLang; + log.debug("Mapping field "+fieldName+" using document global language "+fieldLang); + } + String mappedOutputField = getMappedField(fieldName, fieldLang); + if(enforceSchema && schema.getFieldOrNull(fieldName) == null) { + log.warn("Unsuccessful field name mapping to {}, field does not exist, skipping mapping.", mappedOutputField, fieldName); + mappedOutputField = fieldName; + } + + if (mappedOutputField != null) { + log.debug("Mapping field {} to {}", doc.getFieldValue(docIdField), fieldLang); + SolrInputField inField = doc.getField(fieldName); + doc.setField(mappedOutputField, inField.getValue(), inField.getBoost()); + if(!mapKeepOrig) { + log.debug("Removing old field {}", fieldName); + doc.removeField(fieldName); + } + } else { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Invalid output field mapping for " + + fieldName + " field and language: " + fieldLang); + } + } else { + log.warn("Document {} does not contain input field {}. 
Skipping this field.", doc.getFieldValue(docIdField), fieldName); + } + } + } + + // Set the languages field to an array of all detected languages + if(langsField != null && langsField.length() != 0) { + doc.setField(langsField, docLangs.toArray()); + } + + return doc; + } + + /** + * Decides the fallback language, either from content of fallback field or fallback value + * @param doc the Solr document + * @param fallbackFields an array of strings with field names containing fallback language codes + * @param fallbackValue a language code to use in case no fallbackFields are found + */ + private String getFallbackLang(SolrInputDocument doc, String[] fallbackFields, String fallbackValue) { + String lang = null; + for(String field : fallbackFields) { + if(doc.containsKey(field)) { + lang = (String) doc.getFieldValue(field); + log.debug("Language fallback to field "+field); + break; + } + } + if(lang == null) { + log.debug("Language fallback to value "+fallbackValue); + lang = fallbackValue; + } + return lang; + } + + /* + * Concatenates content from multiple fields + */ + protected String concatFields(SolrInputDocument doc, String[] fields) { + StringBuffer sb = new StringBuffer(); + for (String fieldName : inputFields) { + log.debug("Appending field "+fieldName); + if (doc.containsKey(fieldName)) { + Object content = doc.getFieldValue(fieldName); + if(content instanceof String) { + sb.append((String) doc.getFieldValue(fieldName)); + sb.append(" "); + } else { + log.warn("Field "+fieldName+" not a String value, not including in detection"); + } + } + } + return sb.toString(); + } + + /** + * Detects language(s) from a string. + * Classes wishing to implement their own language detection module should override this method. 
+ * @param content The content to identify + * @return List of detected language(s) according to RFC-3066 + */ + protected List detectLanguage(String content) { + List languages = new ArrayList(); + if(content.trim().length() != 0) { + LanguageIdentifier identifier = new LanguageIdentifier(content.toString()); + // FIXME: Hack - we get the distance from toString and calculate our own certainty score + Double distance = Double.parseDouble(tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1")); + // This formula gives: 0.02 => 0.8, 0.1 => 0.5 which is a better sweetspot than isReasonablyCertain() + Double certainty = 1 - (5 * distance); + certainty = (certainty < 0) ? 0 : certainty; + DetectedLanguage language = new DetectedLanguage(identifier.getLanguage(), certainty); + languages.add(language); + log.debug("Language detected as "+language+" with a certainty of "+language.getCertainty()+" (Tika distance="+identifier.toString()+")"); + } else { + log.debug("No input text to detect language from, returning empty list"); + } + return languages; + } + + /** + * Chooses a language based on the list of candidates detected + * @param language language code as a string + * @param fallbackLang the language code to use as a fallback + * @return a string of the chosen language + */ + protected String resolveLanguage(String language, String fallbackLang) { + List l = new ArrayList(); + l.add(new DetectedLanguage(language, 1.0)); + return resolveLanguage(l, fallbackLang); + } + + /** + * Chooses a language based on the list of candidates detected + * @param languages a List of DetectedLanguages with certainty score + * @param fallbackLang the language code to use as a fallback + * @return a string of the chosen language + */ + protected String resolveLanguage(List languages, String fallbackLang) { + String langStr; + if(languages.size() == 0) { + log.debug("No language detected, using fallback {}", fallbackLang); + langStr = fallbackLang; + } else { + 
DetectedLanguage lang = languages.get(0); + if(langWhitelist.isEmpty() || langWhitelist.contains(lang.getLangCode())) { + log.debug("Language detected {} with certainty {}", lang.getLangCode(), lang.getCertainty()); + if(lang.getCertainty() >= threshold) { + langStr = lang.getLangCode(); + } else { + log.debug("Detected language below threshold {}, using fallback {}", threshold, fallbackLang); + langStr = fallbackLang; + } + } else { + log.debug("Detected a language not in whitelist ({}), using fallback {}", lang.getLangCode(), fallbackLang); + langStr = fallbackLang; + } + } + + if(langStr == null || langStr.length() == 0) { + log.warn("Language resolved to null or empty string. Fallback not configured?"); + langStr = ""; + } + + return langStr; + } + + /** + * Returns the name of the field to map the current contents into, so that they are properly analyzed. For instance + * if the currentField is "text" and the code is "en", the new field would be "text_en". If such a field doesn't exist, + * then null is returned. + * + * @param currentField The current field name + * @param language the language code + * @return The new schema field name, based on pattern and replace + */ + protected String getMappedField(String currentField, String language) { + String lc = lcMap.containsKey(language) ? 
lcMap.get(language) : language; + String newFieldName = langPattern.matcher(mapPattern.matcher(currentField).replaceFirst(mapReplaceStr)).replaceFirst(lc); + log.debug("Doing mapping from "+currentField+" with language "+language+" to field "+newFieldName); + return newFieldName; + } + + /** + * Tells if this processor is enabled or not + * @return true if enabled, else false + */ + public boolean isEnabled() { + return enabled; + } + + public void setEnabled(boolean enabled) { + this.enabled = enabled; + } + +} diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactory.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactory.java new file mode 100644 index 00000000000..43c94f4bced --- /dev/null +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactory.java @@ -0,0 +1,94 @@ +package org.apache.solr.update.processor; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.util.SolrPluginUtils; +import org.apache.solr.util.plugin.SolrCoreAware; + +/** + * Identifies the language of a set of input fields using Tika's + * LanguageIdentifier. The tika-core-x.y.jar must be on the classpath + *

+ * The UpdateProcessorChain config entry can take a number of parameters + * which may also be passed as HTTP parameters on the update request + * and override the defaults. Here is the simplest processor config possible: + * + *

+ * <processor class="org.apache.solr.update.processor.LanguageIdentifierUpdateProcessorFactory">
+ *   <str name="langid.fl">title,text</str>
+ *   <str name="langid.langField">language_s</str>
+ * </processor>
+ * 
+ * See http://wiki.apache.org/solr/LanguageDetection + * @since 3.5 + */ +public class LanguageIdentifierUpdateProcessorFactory extends + UpdateRequestProcessorFactory implements SolrCoreAware, LangIdParams { + + protected SolrParams defaults; + protected SolrParams appends; + protected SolrParams invariants; + + @Override + public void inform(SolrCore core) { + } + + /** + * The UpdateRequestProcessor may be initialized in solrconfig.xml similarly + * to a RequestHandler, with defaults, appends and invariants. + * @param args a NamedList with the configuration parameters + */ + @SuppressWarnings("rawtypes") + public void init( NamedList args ) + { + if (args != null) { + Object o; + o = args.get("defaults"); + if (o != null && o instanceof NamedList) { + defaults = SolrParams.toSolrParams((NamedList) o); + } else { + defaults = SolrParams.toSolrParams(args); + } + o = args.get("appends"); + if (o != null && o instanceof NamedList) { + appends = SolrParams.toSolrParams((NamedList) o); + } + o = args.get("invariants"); + if (o != null && o instanceof NamedList) { + invariants = SolrParams.toSolrParams((NamedList) o); + } + } + } + + @Override + public UpdateRequestProcessor getInstance(SolrQueryRequest req, + SolrQueryResponse rsp, UpdateRequestProcessor next) { + // Process defaults, appends and invariants if we got a request + if(req != null) { + SolrPluginUtils.setDefaults(req, defaults, appends, invariants); + } + return new LanguageIdentifierUpdateProcessor(req, rsp, next); + } + + +} diff --git a/solr/contrib/langid/src/test-files/langid/solr/conf/schema.xml b/solr/contrib/langid/src/test-files/langid/solr/conf/schema.xml new file mode 100644 index 00000000000..ddb7c2481d3 --- /dev/null +++ b/solr/contrib/langid/src/test-files/langid/solr/conf/schema.xml @@ -0,0 +1,81 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + name + id + + + diff --git 
a/solr/contrib/langid/src/test-files/langid/solr/conf/solrconfig-languageidentifier.xml b/solr/contrib/langid/src/test-files/langid/solr/conf/solrconfig-languageidentifier.xml new file mode 100644 index 00000000000..f3f97d4ce8a --- /dev/null +++ b/solr/contrib/langid/src/test-files/langid/solr/conf/solrconfig-languageidentifier.xml @@ -0,0 +1,81 @@ + + + + + + + + + + ${solr.data.dir:./solr/data} + + + + + LUCENE_40 + + + + explicit + 10 + + + + + true + + + + + + + + + + max-age=30, public + + + + + + lang_id + + + + + + + + true + name,subject + true + language_s + language_sm + th:thai + 0.5 + fallback + + + + + + diff --git a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTest.java b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTest.java new file mode 100644 index 00000000000..c0a886d70a9 --- /dev/null +++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTest.java @@ -0,0 +1,216 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.update.processor; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.core.SolrCore; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.servlet.SolrRequestParsers; + +public class LanguageIdentifierUpdateProcessorFactoryTest extends SolrTestCaseJ4 { + + protected static SolrRequestParsers _parser; + protected static SolrQueryRequest req; + protected static SolrQueryResponse resp = new SolrQueryResponse(); + protected static LanguageIdentifierUpdateProcessor liProcessor; + protected static ModifiableSolrParams parameters; + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-languageidentifier.xml", "schema.xml", getFile("langid/solr").getAbsolutePath()); + SolrCore core = h.getCore(); + UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("lang_id"); + assertNotNull(chained); + _parser = new SolrRequestParsers(null); + } + + @Override + @Before + public void setUp() throws Exception { + super.setUp(); + clearIndex(); + assertU(commit()); + } + + @Test + public void testLangIdGlobal() throws Exception { + parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "name,subject"); + parameters.add("langid.langField", "language_s"); + parameters.add("langid.fallback", "un"); + liProcessor = createLangIdProcessor(parameters); + + assertLang("no", "id", "1no", "name", "Lucene", "subject", "Lucene er et fri/åpen kildekode programvarebibliotek for informasjonsgjenfinning, opprinnelig utviklet i programmeringsspråket Java av Doug Cutting. 
Lucene støttes av Apache Software Foundation og utgis under Apache-lisensen."); + assertLang("en", "id", "2en", "name", "Lucene", "subject", "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License."); + assertLang("sv", "id", "3sv", "name", "Maven", "subject", "Apache Maven är ett verktyg utvecklat av Apache Software Foundation och används inom systemutveckling av datorprogram i programspråket Java. Maven används för att automatiskt paketera (bygga) programfilerna till en distribuerbar enhet. Maven används inom samma område som Apache Ant men dess byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade."); + assertLang("es", "id", "4es", "name", "Lucene", "subject", "Lucene es un API de código abierto para recuperación de información, originalmente implementada en Java por Doug Cutting. Está apoyado por el Apache Software Foundation y se distribuye bajo la Apache Software License. Lucene tiene versiones para otros lenguajes incluyendo Delphi, Perl, C#, C++, Python, Ruby y PHP."); + assertLang("un", "id", "5un", "name", "a", "subject", "b"); + assertLang("th", "id", "6th", "name", "บทความคัดสรรเดือนนี้", "subject", "อันเนอลีส มารี อันเนอ ฟรังค์ หรือมักรู้จักในภาษาไทยว่า แอนน์ แฟรงค์ เป็นเด็กหญิงชาวยิว เกิดที่เมืองแฟรงก์เฟิร์ต ประเทศเยอรมนี เธอมีชื่อเสียงโด่งดังในฐานะผู้เขียนบันทึกประจำวันซึ่งต่อมาได้รับการตีพิมพ์เป็นหนังสือ บรรยายเหตุการณ์ขณะหลบซ่อนตัวจากการล่าชาวยิวในประเทศเนเธอร์แลนด์ ระหว่างที่ถูกเยอรมนีเข้าครอบครองในช่วงสงครามโลกครั้งที่สอง"); + assertLang("ru", "id", "7ru", "name", "Lucene", "subject", "The Apache Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска, написанная на Java. 
Может быть использована для поиска в интернете и других областях компьютерной лингвистики (аналитическая философия)."); + assertLang("de", "id", "8de", "name", "Lucene", "subject", "Lucene ist ein Freie-Software-Projekt der Apache Software Foundation, das eine Suchsoftware erstellt. Durch die hohe Leistungsfähigkeit und Skalierbarkeit können die Lucene-Werkzeuge für beliebige Projektgrößen und Anforderungen eingesetzt werden. So setzt beispielsweise Wikipedia Lucene für die Volltextsuche ein. Zudem verwenden die beiden Desktop-Suchprogramme Beagle und Strigi eine C#- bzw. C++- Portierung von Lucene als Indexer."); + assertLang("fr", "id", "9fr", "name", "Lucene", "subject", "Lucene est un moteur de recherche libre écrit en Java qui permet d'indexer et de rechercher du texte. C'est un projet open source de la fondation Apache mis à disposition sous licence Apache. Il est également disponible pour les langages Ruby, Perl, C++, PHP."); + assertLang("nl", "id", "10nl", "name", "Lucene", "subject", "Lucene is een gratis open source, tekst gebaseerde information retrieval API van origine geschreven in Java door Doug Cutting. Het wordt ondersteund door de Apache Software Foundation en is vrijgegeven onder de Apache Software Licentie. Lucene is ook beschikbaar in andere programeertalen zoals Perl, C#, C++, Python, Ruby en PHP."); + assertLang("it", "id", "11it", "name", "Lucene", "subject", "Lucene è una API gratuita ed open source per il reperimento di informazioni inizialmente implementata in Java da Doug Cutting. È supportata dall'Apache Software Foundation ed è resa disponibile con l'Apache License. Lucene è stata successivamente reimplementata in Perl, C#, C++, Python, Ruby e PHP."); + assertLang("pt", "id", "12pt", "name", "Lucene", "subject", "Apache Lucene, ou simplesmente Lucene, é um software de busca e uma API de indexação de documentos, escrito na linguagem de programação Java. 
É um software de código aberto da Apache Software Foundation licenciado através da licença Apache."); + } + + @Test + public void testMapFieldName() throws Exception { + parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "name"); + parameters.add("langid.map.lcmap", "jp:s zh:cjk ko:cjk"); + parameters.add("langid.enforceSchema", "true"); + liProcessor = createLangIdProcessor(parameters); + + assertEquals("test_no", liProcessor.getMappedField("test", "no")); + assertEquals("test_en", liProcessor.getMappedField("test", "en")); + assertEquals("test_s", liProcessor.getMappedField("test", "jp")); + assertEquals("test_cjk", liProcessor.getMappedField("test", "zh")); + assertEquals("test_cjk", liProcessor.getMappedField("test", "ko")); + + // Prove support for other mapping regex + parameters.add("langid.map.pattern", "text_(.*?)_field"); + parameters.add("langid.map.replace", "$1_{lang}Text"); + liProcessor = createLangIdProcessor(parameters); + + assertEquals("title_noText", liProcessor.getMappedField("text_title_field", "no")); + assertEquals("body_svText", liProcessor.getMappedField("text_body_field", "sv")); + } + + @Test + public void testPreExisting() throws Exception { + SolrInputDocument doc; + parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "text"); + parameters.add("langid.langField", "language"); + parameters.add("langid.langsField", "languages"); + parameters.add("langid.enforceSchema", "false"); + parameters.add("langid.map", "true"); + liProcessor = createLangIdProcessor(parameters); + + doc = englishDoc(); + assertEquals("en", liProcessor.process(doc).getFieldValue("language")); + assertEquals("en", liProcessor.process(doc).getFieldValue("languages")); + + doc = englishDoc(); + doc.setField("language", "no"); + assertEquals("no", liProcessor.process(doc).getFieldValue("language")); + assertEquals("no", liProcessor.process(doc).getFieldValue("languages")); + 
+ assertNotNull(liProcessor.process(doc).getFieldValue("text_no")); + } + + @Test + public void testDefaultFallbackEmptyString() throws Exception { + SolrInputDocument doc; + parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "text"); + parameters.add("langid.langField", "language"); + parameters.add("langid.enforceSchema", "false"); + liProcessor = createLangIdProcessor(parameters); + + doc = tooShortDoc(); + assertEquals("", liProcessor.process(doc).getFieldValue("language")); + } + + @Test + public void testFallback() throws Exception { + SolrInputDocument doc; + parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "text"); + parameters.add("langid.langField", "language"); + parameters.add("langid.fallbackFields", "noop,fb"); + parameters.add("langid.fallback", "fbVal"); + parameters.add("langid.enforceSchema", "false"); + liProcessor = createLangIdProcessor(parameters); + + // Verify fallback to field fb (noop field does not exist and is skipped) + doc = tooShortDoc(); + doc.addField("fb", "fbField"); + assertEquals("fbField", liProcessor.process(doc).getFieldValue("language")); + + // Verify fallback to fallback value since no fallback fields exist + doc = tooShortDoc(); + assertEquals("fbVal", liProcessor.process(doc).getFieldValue("language")); + } + + @Test + public void testResolveLanguage() throws Exception { + List<DetectedLanguage> langs; + parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "text"); + parameters.add("langid.langField", "language"); + liProcessor = createLangIdProcessor(parameters); + + // No detected languages + langs = new ArrayList<DetectedLanguage>(); + assertEquals("", liProcessor.resolveLanguage(langs, null)); + assertEquals("fallback", liProcessor.resolveLanguage(langs, "fallback")); + + // One detected language + langs.add(new DetectedLanguage("one", 1.0)); + assertEquals("one", liProcessor.resolveLanguage(langs, "fallback")); + + // One detected language under default threshold + langs = new ArrayList<DetectedLanguage>(); +
langs.add(new DetectedLanguage("under", 0.1)); + assertEquals("fallback", liProcessor.resolveLanguage(langs, "fallback")); + } + + + // Various utility methods + + private SolrInputDocument englishDoc() { + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("text", "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License."); + return doc; + } + + private SolrInputDocument tooShortDoc() { + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("text", "This text is too short"); + return doc; + } + + private LanguageIdentifierUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) throws Exception { + return new LanguageIdentifierUpdateProcessor(_parser.buildRequestFrom(null, parameters, null), resp, null); + } + + private void assertLang(String langCode, String... fieldsAndValues) throws Exception { + if(liProcessor == null) + throw new Exception("Processor must be initialized before calling assertLang()"); + SolrInputDocument doc = sid(fieldsAndValues); + assertEquals(langCode, liProcessor.process(doc).getFieldValue(liProcessor.langField)); + } + + private SolrInputDocument sid(String... fieldsAndValues) { + SolrInputDocument doc = new SolrInputDocument(); + for (int i = 0; i < fieldsAndValues.length; i+=2) { + doc.addField(fieldsAndValues[i], fieldsAndValues[i+1]); + } + return doc; + } +} diff --git a/solr/example/solr/conf/solrconfig.xml b/solr/example/solr/conf/solrconfig.xml index 7c5823ae9dd..ba83ee680d0 100755 --- a/solr/example/solr/conf/solrconfig.xml +++ b/solr/example/solr/conf/solrconfig.xml @@ -62,6 +62,7 @@ --> + @@ -1527,7 +1528,29 @@ --> + + + +