diff --git a/dev-tools/idea/solr/contrib/langid/langid.iml b/dev-tools/idea/solr/contrib/langid/langid.iml index 28223bd1352..afeb1255819 100644 --- a/dev-tools/idea/solr/contrib/langid/langid.iml +++ b/dev-tools/idea/solr/contrib/langid/langid.iml @@ -31,5 +31,6 @@ + diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 83767941a4c..2179602226c 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -94,6 +94,8 @@ New Features * SOLR-11810: Upgrade Jetty to 9.4.8.v20171121 (Varun Thacker, Erick Erickson) +* SOLR-11592: Add OpenNLP language detection to the langid contrib. (Koji, Steve Rowe) + Bug Fixes ---------------------- diff --git a/solr/contrib/langid/README.txt b/solr/contrib/langid/README.txt index 2e6cd54d4c6..68a2ea58c39 100644 --- a/solr/contrib/langid/README.txt +++ b/solr/contrib/langid/README.txt @@ -18,4 +18,5 @@ Please refer to the module documentation at http://wiki.apache.org/solr/Language Dependencies ------------ The Tika detector depends on Tika Core (which is part of extraction contrib) -The Langdetect detector depends on LangDetect library \ No newline at end of file +The Langdetect detector depends on LangDetect library +The OpenNLP detector depends on OpenNLP tools and requires a previously trained user-supplied model diff --git a/solr/contrib/langid/build.xml b/solr/contrib/langid/build.xml index 8341a763354..aca7aebb1cb 100644 --- a/solr/contrib/langid/build.xml +++ b/solr/contrib/langid/build.xml @@ -25,6 +25,17 @@ + + + + + + + + + + + @@ -39,4 +50,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/contrib/langid/ivy.xml b/solr/contrib/langid/ivy.xml index 88dc62830ff..04c6b253e5b 100644 --- a/solr/contrib/langid/ivy.xml +++ b/solr/contrib/langid/ivy.xml @@ -25,6 +25,7 @@ + diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java 
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java index a8d6523bbe8..3679905dd79 100644 --- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java @@ -33,6 +33,7 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -399,4 +400,67 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro this.enabled = enabled; } + + + /** + * Concatenates content from multiple fields + */ + protected String concatFields(SolrInputDocument doc) { + StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields)); + for (String fieldName : inputFields) { + log.debug("Appending field " + fieldName); + if (doc.containsKey(fieldName)) { + Collection fieldValues = doc.getFieldValues(fieldName); + if (fieldValues != null) { + for (Object content : fieldValues) { + if (content instanceof String) { + String stringContent = (String) content; + if (stringContent.length() > maxFieldValueChars) { + sb.append(stringContent.substring(0, maxFieldValueChars)); + } else { + sb.append(stringContent); + } + sb.append(" "); + if (sb.length() > maxTotalChars) { + sb.setLength(maxTotalChars); + break; + } + } else { + log.warn("Field " + fieldName + " not a String value, not including in detection"); + } + } + } + } + } + return sb.toString(); + } + + /** + * Calculate expected string size. 
+ * + * @param doc solr input document + * @param fields fields to select + * @return expected size of string value + */ + private int getExpectedSize(SolrInputDocument doc, String[] fields) { + int docSize = 0; + for (String field : fields) { + if (doc.containsKey(field)) { + Collection contents = doc.getFieldValues(field); + if (contents != null) { + for (Object content : contents) { + if (content instanceof String) { + docSize += Math.min(((String) content).length(), maxFieldValueChars); + } + } + + if (docSize > maxTotalChars) { + docSize = maxTotalChars; + break; + } + } + } + } + return docSize; + } } diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java new file mode 100644 index 00000000000..83f4fe4cdaf --- /dev/null +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.update.processor; + +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import opennlp.tools.langdetect.Language; +import opennlp.tools.langdetect.LanguageDetectorME; +import opennlp.tools.langdetect.LanguageDetectorModel; + +/** + * Identifies the language of a set of input fields using Apache OpenNLP. + * + * See "Language Detector" section of + * https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html + */ +public class OpenNLPLangDetectUpdateProcessor extends LanguageIdentifierUpdateProcessor { + + private final LanguageDetectorModel model; + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + /** Maps ISO 639-3 (3-letter language code) to ISO 639-1 (2-letter language code) */ + private static final Map ISO639_MAP = make_ISO639_map(); + + public OpenNLPLangDetectUpdateProcessor(SolrQueryRequest req, SolrQueryResponse rsp, + UpdateRequestProcessor next, LanguageDetectorModel model) { + super(req, rsp, next); + this.model = model; + } + + @Override + protected List detectLanguage(SolrInputDocument doc) { + List languages = new ArrayList<>(); + String content = concatFields(doc); + if (content.length() != 0) { + LanguageDetectorME ldme = new LanguageDetectorME(model); + Language[] langs = ldme.predictLanguages(content); + for(Language language: langs){ + languages.add(new DetectedLanguage(ISO639_MAP.get(language.getLang()), language.getConfidence())); + } + } else { + log.debug("No input text to detect language from, returning empty list"); + } + return languages; + } + + private static Map make_ISO639_map() { + Map map = new HashMap<>(); + for (String lang 
: Locale.getISOLanguages()) { + Locale locale = new Locale(lang); + map.put(locale.getISO3Language(), locale.getLanguage()); + } + return map; + } +} diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java new file mode 100644 index 00000000000..dfbdcbdc51e --- /dev/null +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; +import org.apache.solr.core.SolrResourceLoader; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.util.SolrPluginUtils; +import org.apache.solr.util.plugin.SolrCoreAware; + +import opennlp.tools.langdetect.LanguageDetectorModel; + +/** + * Identifies the language of a set of input fields using Apache OpenNLP. + * + * The UpdateProcessorChain config entry can take a number of parameters + * which may also be passed as HTTP parameters on the update request + * and override the defaults. Here is the simplest processor config possible: + * + * + * <processor class="org.apache.solr.update.processor.OpenNLPLangDetectUpdateProcessorFactory"> + * <str name="langid.fl">title,text</str> + * <str name="langid.langField">language_s</str> + * <str name="langid.model">langdetect-183.bin</str> + * </processor> + * + * See http://wiki.apache.org/solr/LanguageDetection + */ +public class OpenNLPLangDetectUpdateProcessorFactory extends UpdateRequestProcessorFactory + implements SolrCoreAware { + + private static final String MODEL_PARAM = "langid.model"; + private String modelFile; + private LanguageDetectorModel model; + protected SolrParams defaults; + protected SolrParams appends; + protected SolrParams invariants; + private SolrResourceLoader solrResourceLoader; + + @Override + public void init( NamedList args ) + { + if (args != null) { + Object o; + o = args.get("defaults"); + if (o != null && o instanceof NamedList) { + defaults = SolrParams.toSolrParams((NamedList) o); + } else { + defaults = SolrParams.toSolrParams(args); + } + o = args.get("appends"); + if (o != null && o instanceof NamedList) { + appends = 
SolrParams.toSolrParams((NamedList) o); + } + o = args.get("invariants"); + if (o != null && o instanceof NamedList) { + invariants = SolrParams.toSolrParams((NamedList) o); + } + + // Look for model filename in invariants, then in args, then defaults + if (invariants != null) { + modelFile = invariants.get(MODEL_PARAM); + } + if (modelFile == null) { + o = args.get(MODEL_PARAM); + if (o != null && o instanceof String) { + modelFile = (String)o; + } else { + modelFile = defaults.get(MODEL_PARAM); + if (modelFile == null) { + throw new RuntimeException("Couldn't load language model, will return empty languages always!"); + } + } + } + } + } + + @Override + public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + // Process defaults, appends and invariants if we got a request + if (req != null) { + SolrPluginUtils.setDefaults(req, defaults, appends, invariants); + } + return new OpenNLPLangDetectUpdateProcessor(req, rsp, next, model); + } + + private void loadModel() throws IOException { + InputStream is = null; + try{ + if (modelFile != null) { + is = solrResourceLoader.openResource(modelFile); + model = new LanguageDetectorModel(is); + } + } + finally{ + IOUtils.closeQuietly(is); + } + } + + @Override + public void inform(SolrCore core){ + solrResourceLoader = core.getResourceLoader(); + try { + loadModel(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java index df0e5f7fa25..5c8146d1db5 100644 --- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java @@ -28,8 +28,6 @@ import 
org.apache.solr.common.SolrInputDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Collection; - /** * Identifies the language of a set of input fields using Tika's * LanguageIdentifier. @@ -67,67 +65,4 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd } return languages; } - - - /** - * Concatenates content from multiple fields - */ - protected String concatFields(SolrInputDocument doc) { - StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields)); - for (String fieldName : inputFields) { - log.debug("Appending field " + fieldName); - if (doc.containsKey(fieldName)) { - Collection fieldValues = doc.getFieldValues(fieldName); - if (fieldValues != null) { - for (Object content : fieldValues) { - if (content instanceof String) { - String stringContent = (String) content; - if (stringContent.length() > maxFieldValueChars) { - sb.append(stringContent.substring(0, maxFieldValueChars)); - } else { - sb.append(stringContent); -} - sb.append(" "); - if (sb.length() > maxTotalChars) { - sb.setLength(maxTotalChars); - break; - } - } else { - log.warn("Field " + fieldName + " not a String value, not including in detection"); - } - } - } - } - } - return sb.toString(); - } - - /** - * Calculate expected string size. 
- * - * @param doc solr input document - * @param fields fields to select - * @return expected size of string value - */ - private int getExpectedSize(SolrInputDocument doc, String[] fields) { - int docSize = 0; - for (String field : fields) { - if (doc.containsKey(field)) { - Collection contents = doc.getFieldValues(field); - if (contents != null) { - for (Object content : contents) { - if (content instanceof String) { - docSize += Math.min(((String) content).length(), maxFieldValueChars); - } - } - - if (docSize > maxTotalChars) { - docSize = maxTotalChars; - break; - } - } - } - } - return docSize; - } } diff --git a/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/opennlp-langdetect.eng-swe-spa-rus-deu.bin b/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/opennlp-langdetect.eng-swe-spa-rus-deu.bin new file mode 100644 index 00000000000..ad584e65be0 Binary files /dev/null and b/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/opennlp-langdetect.eng-swe-spa-rus-deu.bin differ diff --git a/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml b/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml index 9ae54adc148..01dbee9aaea 100644 --- a/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml +++ b/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml @@ -57,11 +57,11 @@ - lang_id + lang_id_tika - + @@ -78,7 +78,7 @@ - + @@ -94,5 +94,22 @@ - + + + + + + true + name,subject + true + language_s + language_sm + th:thai + 0.3 + opennlp-langdetect.eng-swe-spa-rus-deu.bin + + + + + diff --git a/solr/contrib/langid/src/test-files/opennlp.langdetect.trainer.params.txt b/solr/contrib/langid/src/test-files/opennlp.langdetect.trainer.params.txt new file mode 100644 index 00000000000..1ecec823ff3 --- /dev/null +++ 
b/solr/contrib/langid/src/test-files/opennlp.langdetect.trainer.params.txt @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +Algorithm=PERCEPTRON +Cutoff=0 diff --git a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java index b90f54a4d3f..21ecd7d6a08 100644 --- a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java +++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java @@ -38,7 +38,11 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S public static void beforeClass() throws Exception { initCore("solrconfig-languageidentifier.xml", "schema.xml", getFile("langid/solr").getAbsolutePath()); SolrCore core = h.getCore(); - UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("lang_id"); + UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("lang_id_tika"); + assertNotNull(chained); + chained = core.getUpdateProcessingChain("lang_id_lang_detect"); + 
assertNotNull(chained); + chained = core.getUpdateProcessingChain("lang_id_opennlp"); assertNotNull(chained); } diff --git a/solr/contrib/langid/src/test/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactoryTest.java b/solr/contrib/langid/src/test/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactoryTest.java new file mode 100644 index 00000000000..7b95e6f4ce8 --- /dev/null +++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactoryTest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.update.processor; + +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.junit.Test; + +public class OpenNLPLangDetectUpdateProcessorFactoryTest extends LanguageIdentifierUpdateProcessorFactoryTestCase { + private static final String TEST_MODEL = "opennlp-langdetect.eng-swe-spa-rus-deu.bin"; + + @Override + protected OpenNLPLangDetectUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) throws Exception { + if (parameters.get("langid.model") == null) { // handle superclass tests that don't provide the model filename + parameters.set("langid.model", TEST_MODEL); + } + if (parameters.get("langid.threshold") == null) { // handle superclass tests that don't provide confidence threshold + parameters.set("langid.threshold", "0.3"); + } + SolrQueryRequest req = _parser.buildRequestFrom(h.getCore(), new ModifiableSolrParams(), null); + OpenNLPLangDetectUpdateProcessorFactory factory = new OpenNLPLangDetectUpdateProcessorFactory(); + factory.init(parameters.toNamedList()); + factory.inform(h.getCore()); + return (OpenNLPLangDetectUpdateProcessor)factory.getInstance(req, resp, null); + } + + // this one actually works better it seems with short docs + @Override + protected SolrInputDocument tooShortDoc() { + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("text", ""); + return doc; + } + + @Test @Override + public void testLangIdGlobal() throws Exception { + ModifiableSolrParams parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "name,subject"); + parameters.add("langid.langField", "language_s"); + parameters.add("langid.model", TEST_MODEL); + parameters.add("langid.threshold", "0.3"); + liProcessor = createLangIdProcessor(parameters); + + assertLang("en", "id", "1en", "name", "Lucene", "subject", "Apache Lucene is a free/open source information retrieval software library, 
originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License."); + assertLang("sv", "id", "2sv", "name", "Maven", "subject", "Apache Maven är ett verktyg utvecklat av Apache Software Foundation och används inom systemutveckling av datorprogram i programspråket Java. Maven används för att automatiskt paketera (bygga) programfilerna till en distribuerbar enhet. Maven används inom samma område som Apache Ant men dess byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade."); + assertLang("es", "id", "3es", "name", "Lucene", "subject", "Lucene es un API de código abierto para recuperación de información, originalmente implementada en Java por Doug Cutting. Está apoyado por el Apache Software Foundation y se distribuye bajo la Apache Software License. Lucene tiene versiones para otros lenguajes incluyendo Delphi, Perl, C#, C++, Python, Ruby y PHP."); + assertLang("ru", "id", "4ru", "name", "Lucene", "subject", "The Apache Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска, написанная на Java. Может быть использована для поиска в интернете и других областях компьютерной лингвистики (аналитическая философия)."); + assertLang("de", "id", "5de", "name", "Lucene", "subject", "Lucene ist ein Freie-Software-Projekt der Apache Software Foundation, das eine Suchsoftware erstellt. Durch die hohe Leistungsfähigkeit und Skalierbarkeit können die Lucene-Werkzeuge für beliebige Projektgrößen und Anforderungen eingesetzt werden. So setzt beispielsweise Wikipedia Lucene für die Volltextsuche ein. Zudem verwenden die beiden Desktop-Suchprogramme Beagle und Strigi eine C#- bzw. 
C++- Portierung von Lucene als Indexer."); + } +} diff --git a/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc b/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc index 12e8804234c..7caccb72054 100644 --- a/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc +++ b/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc @@ -18,12 +18,13 @@ Solr can identify languages and map text to language-specific fields during indexing using the `langid` UpdateRequestProcessor. -Solr supports two implementations of this feature: +Solr supports three implementations of this feature: * Tika's language detection feature: http://tika.apache.org/0.10/detection.html * LangDetect language detection: https://github.com/shuyo/language-detection +* OpenNLP language detection: http://opennlp.apache.org/docs/1.8.4/manual/opennlp.html#tools.langdetect -You can see a comparison between the two implementations here: http://blog.mikemccandless.com/2011/10/accuracy-and-performance-of-googles.html. In general, the LangDetect implementation supports more languages with higher performance. +You can see a comparison between the Tika and LangDetect implementations here: http://blog.mikemccandless.com/2011/10/accuracy-and-performance-of-googles.html. In general, the LangDetect implementation supports more languages with higher performance. For specific information on each of these language identification implementations, including a list of supported languages for each, see the relevant project websites. @@ -61,6 +62,30 @@ Here is an example of a minimal LangDetect `langid` configuration in `solrconfig ---- +=== Configuring OpenNLP Language Detection + +Here is an example of a minimal OpenNLP `langid` configuration in `solrconfig.xml`: + +[source,xml] +---- + + + title,subject,text,keywords + language_s + langdetect-183.bin + + +---- + +==== OpenNLP-specific parameters + +`langid.model`:: +An OpenNLP language detection model. 
The OpenNLP project provides a pre-trained 103 language model on the http://opennlp.apache.org/models.html[OpenNLP site's model download page]. Model training instructions are provided on the http://opennlp.apache.org/docs/1.8.4/manual/opennlp.html#tools.langdetect[OpenNLP website]. This parameter is required. + +==== OpenNLP language codes + +`OpenNLPLangDetectUpdateProcessor` automatically converts the 3-letter ISO 639-3 codes detected by the OpenNLP model into 2-letter ISO 639-1 codes. + == langid Parameters As previously mentioned, both implementations of the `langid` UpdateRequestProcessor take the same parameters.
+ * See "Language Detector" section of + * https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html + */ +public class OpenNLPLangDetectUpdateProcessor extends LanguageIdentifierUpdateProcessor { + + private final LanguageDetectorModel model; + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + /** Maps ISO 639-3 (3-letter language code) to ISO 639-1 (2-letter language code) */ + private static final Map ISO639_MAP = make_ISO639_map(); + + public OpenNLPLangDetectUpdateProcessor(SolrQueryRequest req, SolrQueryResponse rsp, + UpdateRequestProcessor next, LanguageDetectorModel model) { + super(req, rsp, next); + this.model = model; + } + + @Override + protected List detectLanguage(SolrInputDocument doc) { + List languages = new ArrayList<>(); + String content = concatFields(doc); + if (content.length() != 0) { + LanguageDetectorME ldme = new LanguageDetectorME(model); + Language[] langs = ldme.predictLanguages(content); + for(Language language: langs){ + languages.add(new DetectedLanguage(ISO639_MAP.get(language.getLang()), language.getConfidence())); + } + } else { + log.debug("No input text to detect language from, returning empty list"); + } + return languages; + } + + private static Map make_ISO639_map() { + Map map = new HashMap<>(); + for (String lang : Locale.getISOLanguages()) { + Locale locale = new Locale(lang); + map.put(locale.getISO3Language(), locale.getLanguage()); + } + return map; + } +} diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java new file mode 100644 index 00000000000..dfbdcbdc51e --- /dev/null +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.update.processor; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.commons.io.IOUtils; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; +import org.apache.solr.core.SolrResourceLoader; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.util.SolrPluginUtils; +import org.apache.solr.util.plugin.SolrCoreAware; + +import opennlp.tools.langdetect.LanguageDetectorModel; + +/** + * Identifies the language of a set of input fields using Apache OpenNLP. + * + * The UpdateProcessorChain config entry can take a number of parameters + * which may also be passed as HTTP parameters on the update request + * and override the defaults. 
Here is the simplest processor config possible: + * + * + * <processor class="org.apache.solr.update.processor.OpenNLPLangDetectUpdateProcessorFactory"> + * <str name="langid.fl">title,text</str> + * <str name="langid.langField">language_s</str> + * <str name="langid.model">langdetect-183.bin</str> + * </processor> + * + * See http://wiki.apache.org/solr/LanguageDetection + */ +public class OpenNLPLangDetectUpdateProcessorFactory extends UpdateRequestProcessorFactory + implements SolrCoreAware { + + private static final String MODEL_PARAM = "langid.model"; + private String modelFile; + private LanguageDetectorModel model; + protected SolrParams defaults; + protected SolrParams appends; + protected SolrParams invariants; + private SolrResourceLoader solrResourceLoader; + + @Override + public void init( NamedList args ) + { + if (args != null) { + Object o; + o = args.get("defaults"); + if (o != null && o instanceof NamedList) { + defaults = SolrParams.toSolrParams((NamedList) o); + } else { + defaults = SolrParams.toSolrParams(args); + } + o = args.get("appends"); + if (o != null && o instanceof NamedList) { + appends = SolrParams.toSolrParams((NamedList) o); + } + o = args.get("invariants"); + if (o != null && o instanceof NamedList) { + invariants = SolrParams.toSolrParams((NamedList) o); + } + + // Look for model filename in invariants, then in args, then defaults + if (invariants != null) { + modelFile = invariants.get(MODEL_PARAM); + } + if (modelFile == null) { + o = args.get(MODEL_PARAM); + if (o != null && o instanceof String) { + modelFile = (String)o; + } else { + modelFile = defaults.get(MODEL_PARAM); + if (modelFile == null) { + throw new RuntimeException("Couldn't load language model, will return empty languages always!"); + } + } + } + } + } + + @Override + public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) { + // Process defaults, appends and invariants if we got a request + if (req != 
null) { + SolrPluginUtils.setDefaults(req, defaults, appends, invariants); + } + return new OpenNLPLangDetectUpdateProcessor(req, rsp, next, model); + } + + private void loadModel() throws IOException { + InputStream is = null; + try{ + if (modelFile != null) { + is = solrResourceLoader.openResource(modelFile); + model = new LanguageDetectorModel(is); + } + } + finally{ + IOUtils.closeQuietly(is); + } + } + + @Override + public void inform(SolrCore core){ + solrResourceLoader = core.getResourceLoader(); + try { + loadModel(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java index df0e5f7fa25..5c8146d1db5 100644 --- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java @@ -28,8 +28,6 @@ import org.apache.solr.common.SolrInputDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Collection; - /** * Identifies the language of a set of input fields using Tika's * LanguageIdentifier. 
@@ -67,67 +65,4 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd } return languages; } - - - /** - * Concatenates content from multiple fields - */ - protected String concatFields(SolrInputDocument doc) { - StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields)); - for (String fieldName : inputFields) { - log.debug("Appending field " + fieldName); - if (doc.containsKey(fieldName)) { - Collection fieldValues = doc.getFieldValues(fieldName); - if (fieldValues != null) { - for (Object content : fieldValues) { - if (content instanceof String) { - String stringContent = (String) content; - if (stringContent.length() > maxFieldValueChars) { - sb.append(stringContent.substring(0, maxFieldValueChars)); - } else { - sb.append(stringContent); -} - sb.append(" "); - if (sb.length() > maxTotalChars) { - sb.setLength(maxTotalChars); - break; - } - } else { - log.warn("Field " + fieldName + " not a String value, not including in detection"); - } - } - } - } - } - return sb.toString(); - } - - /** - * Calculate expected string size. 
- * - * @param doc solr input document - * @param fields fields to select - * @return expected size of string value - */ - private int getExpectedSize(SolrInputDocument doc, String[] fields) { - int docSize = 0; - for (String field : fields) { - if (doc.containsKey(field)) { - Collection contents = doc.getFieldValues(field); - if (contents != null) { - for (Object content : contents) { - if (content instanceof String) { - docSize += Math.min(((String) content).length(), maxFieldValueChars); - } - } - - if (docSize > maxTotalChars) { - docSize = maxTotalChars; - break; - } - } - } - } - return docSize; - } } diff --git a/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/opennlp-langdetect.eng-swe-spa-rus-deu.bin b/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/opennlp-langdetect.eng-swe-spa-rus-deu.bin new file mode 100644 index 00000000000..ad584e65be0 Binary files /dev/null and b/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/opennlp-langdetect.eng-swe-spa-rus-deu.bin differ diff --git a/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml b/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml index 9ae54adc148..01dbee9aaea 100644 --- a/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml +++ b/solr/contrib/langid/src/test-files/langid/solr/collection1/conf/solrconfig-languageidentifier.xml @@ -57,11 +57,11 @@ - lang_id + lang_id_tika - + @@ -78,7 +78,7 @@ - + @@ -94,5 +94,22 @@ - + + + + + + true + name,subject + true + language_s + language_sm + th:thai + 0.3 + opennlp-langdetect.eng-swe-spa-rus-deu.bin + + + + + diff --git a/solr/contrib/langid/src/test-files/opennlp.langdetect.trainer.params.txt b/solr/contrib/langid/src/test-files/opennlp.langdetect.trainer.params.txt new file mode 100644 index 00000000000..1ecec823ff3 --- /dev/null +++ 
b/solr/contrib/langid/src/test-files/opennlp.langdetect.trainer.params.txt @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +Algorithm=PERCEPTRON +Cutoff=0 diff --git a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java index b90f54a4d3f..21ecd7d6a08 100644 --- a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java +++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java @@ -38,7 +38,11 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S public static void beforeClass() throws Exception { initCore("solrconfig-languageidentifier.xml", "schema.xml", getFile("langid/solr").getAbsolutePath()); SolrCore core = h.getCore(); - UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("lang_id"); + UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("lang_id_tika"); + assertNotNull(chained); + chained = core.getUpdateProcessingChain("lang_id_lang_detect"); + 
assertNotNull(chained); + chained = core.getUpdateProcessingChain("lang_id_opennlp"); assertNotNull(chained); } diff --git a/solr/contrib/langid/src/test/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactoryTest.java b/solr/contrib/langid/src/test/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactoryTest.java new file mode 100644 index 00000000000..7b95e6f4ce8 --- /dev/null +++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactoryTest.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.update.processor; + +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.junit.Test; + +public class OpenNLPLangDetectUpdateProcessorFactoryTest extends LanguageIdentifierUpdateProcessorFactoryTestCase { + private static final String TEST_MODEL = "opennlp-langdetect.eng-swe-spa-rus-deu.bin"; + + @Override + protected OpenNLPLangDetectUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) throws Exception { + if (parameters.get("langid.model") == null) { // handle superclass tests that don't provide the model filename + parameters.set("langid.model", TEST_MODEL); + } + if (parameters.get("langid.threshold") == null) { // handle superclass tests that don't provide confidence threshold + parameters.set("langid.threshold", "0.3"); + } + SolrQueryRequest req = _parser.buildRequestFrom(h.getCore(), new ModifiableSolrParams(), null); + OpenNLPLangDetectUpdateProcessorFactory factory = new OpenNLPLangDetectUpdateProcessorFactory(); + factory.init(parameters.toNamedList()); + factory.inform(h.getCore()); + return (OpenNLPLangDetectUpdateProcessor)factory.getInstance(req, resp, null); + } + + // this one actually works better it seems with short docs + @Override + protected SolrInputDocument tooShortDoc() { + SolrInputDocument doc = new SolrInputDocument(); + doc.addField("text", ""); + return doc; + } + + @Test @Override + public void testLangIdGlobal() throws Exception { + ModifiableSolrParams parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "name,subject"); + parameters.add("langid.langField", "language_s"); + parameters.add("langid.model", TEST_MODEL); + parameters.add("langid.threshold", "0.3"); + liProcessor = createLangIdProcessor(parameters); + + assertLang("en", "id", "1en", "name", "Lucene", "subject", "Apache Lucene is a free/open source information retrieval software library, 
originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License."); + assertLang("sv", "id", "2sv", "name", "Maven", "subject", "Apache Maven är ett verktyg utvecklat av Apache Software Foundation och används inom systemutveckling av datorprogram i programspråket Java. Maven används för att automatiskt paketera (bygga) programfilerna till en distribuerbar enhet. Maven används inom samma område som Apache Ant men dess byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade."); + assertLang("es", "id", "3es", "name", "Lucene", "subject", "Lucene es un API de código abierto para recuperación de información, originalmente implementada en Java por Doug Cutting. Está apoyado por el Apache Software Foundation y se distribuye bajo la Apache Software License. Lucene tiene versiones para otros lenguajes incluyendo Delphi, Perl, C#, C++, Python, Ruby y PHP."); + assertLang("ru", "id", "4ru", "name", "Lucene", "subject", "The Apache Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска, написанная на Java. Может быть использована для поиска в интернете и других областях компьютерной лингвистики (аналитическая философия)."); + assertLang("de", "id", "5de", "name", "Lucene", "subject", "Lucene ist ein Freie-Software-Projekt der Apache Software Foundation, das eine Suchsoftware erstellt. Durch die hohe Leistungsfähigkeit und Skalierbarkeit können die Lucene-Werkzeuge für beliebige Projektgrößen und Anforderungen eingesetzt werden. So setzt beispielsweise Wikipedia Lucene für die Volltextsuche ein. Zudem verwenden die beiden Desktop-Suchprogramme Beagle und Strigi eine C#- bzw. 
C++- Portierung von Lucene als Indexer."); + } +} diff --git a/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc b/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc index 12e8804234c..7caccb72054 100644 --- a/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc +++ b/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc @@ -18,12 +18,13 @@ Solr can identify languages and map text to language-specific fields during indexing using the `langid` UpdateRequestProcessor. -Solr supports two implementations of this feature: +Solr supports three implementations of this feature: * Tika's language detection feature: http://tika.apache.org/0.10/detection.html * LangDetect language detection: https://github.com/shuyo/language-detection +* OpenNLP language detection: http://opennlp.apache.org/docs/1.8.4/manual/opennlp.html#tools.langdetect -You can see a comparison between the two implementations here: http://blog.mikemccandless.com/2011/10/accuracy-and-performance-of-googles.html. In general, the LangDetect implementation supports more languages with higher performance. +You can see a comparison between the Tika and LangDetect implementations here: http://blog.mikemccandless.com/2011/10/accuracy-and-performance-of-googles.html. In general, the LangDetect implementation supports more languages with higher performance. For specific information on each of these language identification implementations, including a list of supported languages for each, see the relevant project websites. @@ -61,6 +62,30 @@ Here is an example of a minimal LangDetect `langid` configuration in `solrconfig ---- +=== Configuring OpenNLP Language Detection + +Here is an example of a minimal OpenNLP `langid` configuration in `solrconfig.xml`: + +[source,xml] +---- + + + title,subject,text,keywords + language_s + langdetect-183.bin + + +---- + +==== OpenNLP-specific parameters + +`langid.model`:: +An OpenNLP language detection model. 
The OpenNLP project provides a pre-trained 103 language model on the http://opennlp.apache.org/models.html[OpenNLP site's model download page]. Model training instructions are provided on the http://opennlp.apache.org/docs/1.8.4/manual/opennlp.html#tools.langdetect[OpenNLP website]. This parameter is required. + +==== OpenNLP language codes + +`OpenNLPLangDetectUpdateProcessor` automatically converts the 3-letter ISO 639-3 codes detected by the OpenNLP model into 2-letter ISO 639-1 codes. + == langid Parameters As previously mentioned, both implementations of the `langid` UpdateRequestProcessor take the same parameters.
+ * The UpdateProcessorChain config entry can take a number of parameters + * which may also be passed as HTTP parameters on the update request + * and override the defaults. Here is the simplest processor config possible: + * + *
+ * <processor class="org.apache.solr.update.processor.OpenNLPLangDetectUpdateProcessorFactory"> + * <str name="langid.fl">title,text</str> + * <str name="langid.langField">language_s</str> + * <str name="langid.model">langdetect-183.bin</str> + * </processor> + *