From c3513e928141ad9ef0aa4e4fef641c70d0e4b4be Mon Sep 17 00:00:00 2001 From: Steve Rowe Date: Mon, 13 Nov 2017 17:57:24 -0500 Subject: [PATCH] SOLR-11231: Guard against unset fields when performing language detection. This closes #232 --- solr/CHANGES.txt | 3 +++ .../TikaLanguageIdentifierUpdateProcessor.java | 18 +++++++++++++----- ...entifierUpdateProcessorFactoryTestCase.java | 13 +++++++++++++ 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index d151353e939..f872b969a2e 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -105,6 +105,9 @@ Bug Fixes * SOLR-11619: V2 requests that needed to be forwarded to other nodes would get an NPE. (David Smiley) +* SOLR-11231: Guard against unset fields when performing language detection. + (Chris Beer via Steve Rowe) + Optimizations ---------------------- * SOLR-11285: Refactor autoscaling framework to avoid direct references to Zookeeper and Solr diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java index 836a3bf6795..df0e5f7fa25 100644 --- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java +++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java @@ -112,13 +112,21 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd private int getExpectedSize(SolrInputDocument doc, String[] fields) { int docSize = 0; for (String field : fields) { - Collection contents = doc.getFieldValues(field); - for (Object content : contents) { - if (content instanceof String) { - docSize += Math.min(((String) content).length(), maxFieldValueChars); + if (doc.containsKey(field)) { + Collection contents = doc.getFieldValues(field); + if (contents != null) { + for (Object content : contents) { + if (content instanceof String) { + docSize += Math.min(((String) content).length(), maxFieldValueChars); + } + } + + if (docSize > maxTotalChars) { + docSize = maxTotalChars; + break; + } } } - docSize = Math.min(docSize, maxTotalChars); } return docSize; } diff --git a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java index c7381a918e7..b90f54a4d3f 100644 --- a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java +++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java @@ -211,6 +211,19 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S assertEquals("", liProcessor.process(doc).getFieldValue("language")); } + @Test + public void testMissingFieldEmptyString() throws Exception { + SolrInputDocument doc; + ModifiableSolrParams parameters = new ModifiableSolrParams(); + parameters.add("langid.fl", "no_such_field"); + parameters.add("langid.langField", "language"); + parameters.add("langid.enforceSchema", "false"); + liProcessor = createLangIdProcessor(parameters); + + doc = new SolrInputDocument(); + assertEquals("", liProcessor.process(doc).getFieldValue("language")); + } + @Test public void testFallback() throws Exception { SolrInputDocument doc;