From 57c0d29114b60f74e43ceb0914c26d490b61a265 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 16 Aug 2013 13:12:24 +0200 Subject: [PATCH] Prevent Phrase Suggester from failing on missing fields. Unless the field is not mapped phrase suggester should return empty results or skip candidate generation if a field in not in the index rather than failing hard with an illegal argument exception. Some shards might not have a value in a certain field. Closes #3469 --- .../phrase/DirectCandidateGenerator.java | 29 ++-- .../search/suggest/phrase/LaplaceScorer.java | 13 +- .../phrase/LinearInterpoatingScorer.java | 9 +- .../suggest/phrase/PhraseSuggestParser.java | 24 ++-- .../suggest/phrase/PhraseSuggester.java | 77 ++++++----- .../suggest/phrase/StupidBackoffScorer.java | 13 +- .../search/suggest/phrase/WordScorer.java | 14 +- .../search/suggest/SuggestSearchTests.java | 129 +++++++++++++++++- .../phrase/NoisyChannelSpellCheckerTests.java | 54 +++----- 9 files changed, 248 insertions(+), 114 deletions(-) diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java b/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java index c450b402286..4760e93dbaf 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java @@ -18,18 +18,8 @@ */ package org.elasticsearch.search.suggest.phrase; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.*; import org.apache.lucene.search.spell.DirectSpellChecker; import org.apache.lucene.search.spell.SuggestMode; import org.apache.lucene.search.spell.SuggestWord; @@ -38,6 +28,12 @@ import org.apache.lucene.util.CharsRef; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.search.suggest.SuggestUtils; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + //TODO public for tests public final class DirectCandidateGenerator extends CandidateGenerator { @@ -58,20 +54,19 @@ public final class DirectCandidateGenerator extends CandidateGenerator { private final int numCandidates; public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader, double nonErrorLikelihood, int numCandidates) throws IOException { - this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, numCandidates, null, null); + this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, numCandidates, null, null, MultiFields.getTerms(reader, field)); } - public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader, double nonErrorLikelihood, int numCandidates, Analyzer preFilter, Analyzer postFilter) throws IOException { + public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader, double nonErrorLikelihood, int numCandidates, Analyzer preFilter, Analyzer postFilter, Terms terms) throws IOException { + if (terms == null) { + throw new ElasticSearchIllegalArgumentException("generator field [" + field + "] doesn't exist"); + } this.spellchecker = spellchecker; this.field = field; this.numCandidates = numCandidates; this.suggestMode = suggestMode; this.reader = reader; - Terms terms = MultiFields.getTerms(reader, field); - if (terms == null) { - throw new ElasticSearchIllegalArgumentException("generator field [" + field + "] doesn't exist"); - } final long dictSize = terms.getSumTotalTermFreq(); this.useTotalTermFrequency = dictSize != -1; this.dictSize = dictSize == -1 ? reader.maxDoc() : dictSize; diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java b/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java index d2973525678..bb0cd5c78d2 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java @@ -18,27 +18,28 @@ */ package org.elasticsearch.search.suggest.phrase; -import java.io.IOException; - import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Terms; import org.apache.lucene.util.BytesRef; import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; + +import java.io.IOException; //TODO public for tests public final class LaplaceScorer extends WordScorer { public static final WordScorerFactory FACTORY = new WordScorer.WordScorerFactory() { @Override - public WordScorer newScorer(IndexReader reader, String field, double realWordLikelyhood, BytesRef separator) throws IOException { - return new LaplaceScorer(reader, field, realWordLikelyhood, separator, 0.5); + public WordScorer newScorer(IndexReader reader, Terms terms, String field, double realWordLikelyhood, BytesRef separator) throws IOException { + return new LaplaceScorer(reader, terms, field, realWordLikelyhood, separator, 0.5); } }; private double alpha; - public LaplaceScorer(IndexReader reader, String field, + public LaplaceScorer(IndexReader reader, Terms terms, String field, double realWordLikelyhood, BytesRef separator, double alpha) throws IOException { - super(reader, field, realWordLikelyhood, separator); + super(reader, terms, field, realWordLikelyhood, separator); this.alpha = alpha; } diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpoatingScorer.java b/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpoatingScorer.java index 0fda2d5b23a..9c7d2ae68cb 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpoatingScorer.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpoatingScorer.java @@ -18,13 +18,14 @@ */ package org.elasticsearch.search.suggest.phrase; -import java.io.IOException; - import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Terms; import org.apache.lucene.util.BytesRef; import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; +import java.io.IOException; + //TODO public for tests public final class LinearInterpoatingScorer extends WordScorer { @@ -32,9 +33,9 @@ public final class LinearInterpoatingScorer extends WordScorer { private final double bigramLambda; private final double trigramLambda; - public LinearInterpoatingScorer(IndexReader reader, String field, double realWordLikelyhood, BytesRef separator, double trigramLambda, double bigramLambda, double unigramLambda) + public LinearInterpoatingScorer(IndexReader reader, Terms terms, String field, double realWordLikelyhood, BytesRef separator, double trigramLambda, double bigramLambda, double unigramLambda) throws IOException { - super(reader, field, realWordLikelyhood, separator); + super(reader, terms, field, realWordLikelyhood, separator); double sum = unigramLambda + bigramLambda + trigramLambda; this.unigramLambda = unigramLambda / sum; this.bigramLambda = bigramLambda / sum; diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestParser.java b/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestParser.java index f87b696690f..6aaf3a50c11 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestParser.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestParser.java @@ -18,10 +18,9 @@ */ package org.elasticsearch.search.suggest.phrase; -import java.io.IOException; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Terms; import org.apache.lucene.util.BytesRef; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.common.xcontent.XContentParser; @@ -33,6 +32,8 @@ import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.SuggestionSearchContext; import org.elasticsearch.search.suggest.phrase.PhraseSuggestionContext.DirectCandidateGenerator; +import java.io.IOException; + public final class PhraseSuggestParser implements SuggestContextParser { private PhraseSuggester suggester; @@ -135,6 +136,10 @@ public final class PhraseSuggestParser implements SuggestContextParser { throw new ElasticSearchIllegalArgumentException("The required field option is missing"); } + if (mapperService.smartNameFieldMapper(suggestion.getField()) == null) { + throw new ElasticSearchIllegalArgumentException("No mapping found for field [" + suggestion.getField() + "]"); + } + if (suggestion.model() == null) { suggestion.setModel(StupidBackoffScorer.FACTORY); } @@ -209,9 +214,9 @@ public final class PhraseSuggestParser implements SuggestContextParser { } suggestion.setModel(new WordScorer.WordScorerFactory() { @Override - public WordScorer newScorer(IndexReader reader, String field, double realWordLikelyhood, BytesRef separator) + public WordScorer newScorer(IndexReader reader, Terms terms, String field, double realWordLikelyhood, BytesRef separator) throws IOException { - return new LinearInterpoatingScorer(reader, field, realWordLikelyhood, separator, lambdas[0], lambdas[1], + return new LinearInterpoatingScorer(reader, terms, field, realWordLikelyhood, separator, lambdas[0], lambdas[1], lambdas[2]); } }); @@ -230,9 +235,9 @@ public final class PhraseSuggestParser implements SuggestContextParser { final double alpha = theAlpha; suggestion.setModel(new WordScorer.WordScorerFactory() { @Override - public WordScorer newScorer(IndexReader reader, String field, double realWordLikelyhood, BytesRef separator) + public WordScorer newScorer(IndexReader reader, Terms terms, String field, double realWordLikelyhood, BytesRef separator) throws IOException { - return new LaplaceScorer(reader, field, realWordLikelyhood, separator, alpha); + return new LaplaceScorer(reader, terms, field, realWordLikelyhood, separator, alpha); } }); @@ -250,9 +255,9 @@ public final class PhraseSuggestParser implements SuggestContextParser { final double discount = theDiscount; suggestion.setModel(new WordScorer.WordScorerFactory() { @Override - public WordScorer newScorer(IndexReader reader, String field, double realWordLikelyhood, BytesRef separator) + public WordScorer newScorer(IndexReader reader, Terms terms, String field, double realWordLikelyhood, BytesRef separator) throws IOException { - return new StupidBackoffScorer(reader, field, realWordLikelyhood, separator, discount); + return new StupidBackoffScorer(reader, terms, field, realWordLikelyhood, separator, discount); } }); @@ -281,6 +286,9 @@ public final class PhraseSuggestParser implements SuggestContextParser { if (!SuggestUtils.parseDirectSpellcheckerSettings(parser, fieldName, generator)) { if ("field".equals(fieldName)) { generator.setField(parser.text()); + if (mapperService.smartNameFieldMapper(generator.field()) == null) { + throw new ElasticSearchIllegalArgumentException("No mapping found for field [" + generator.field() + "]"); + } } else if ("size".equals(fieldName)) { generator.size(parser.intValue()); } else if ("pre_filter".equals(fieldName) || "preFilter".equals(fieldName)) { diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java b/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java index cedb1212487..1c06409647e 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java @@ -21,6 +21,8 @@ package org.elasticsearch.search.suggest.phrase; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Terms; import org.apache.lucene.search.spell.DirectSpellChecker; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; @@ -30,9 +32,11 @@ import org.elasticsearch.common.text.Text; import org.elasticsearch.search.suggest.Suggest.Suggestion; import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry; import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option; -import org.elasticsearch.search.suggest.SuggestContextParser; -import org.elasticsearch.search.suggest.SuggestUtils; -import org.elasticsearch.search.suggest.Suggester; +import org.elasticsearch.search.suggest.*; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import java.io.IOException; import java.util.List; @@ -52,41 +56,50 @@ public final class PhraseSuggester extends Suggester { public Suggestion> innerExecute(String name, PhraseSuggestionContext suggestion, IndexReader indexReader, CharsRef spare) throws IOException { double realWordErrorLikelihood = suggestion.realworldErrorLikelyhood(); - List generators = suggestion.generators(); - CandidateGenerator[] gens = new CandidateGenerator[generators.size()]; - for (int i = 0; i < gens.length; i++) { - PhraseSuggestionContext.DirectCandidateGenerator generator = generators.get(i); - DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(generator); - gens[i] = new DirectCandidateGenerator(directSpellChecker, generator.field(), generator.suggestMode(), indexReader, realWordErrorLikelihood, generator.size(), generator.preFilter(), generator.postFilter()); - } - - - final NoisyChannelSpellChecker checker = new NoisyChannelSpellChecker(realWordErrorLikelihood, suggestion.getRequireUnigram(), suggestion.getTokenLimit()); - final BytesRef separator = suggestion.separator(); - TokenStream stream = checker.tokenStream(suggestion.getAnalyzer(), suggestion.getText(), spare, suggestion.getField()); - WordScorer wordScorer = suggestion.model().newScorer(indexReader, suggestion.getField(), realWordErrorLikelihood, separator); - Correction[] corrections = checker.getCorrections(stream, new MultiCandidateGeneratorWrapper(suggestion.getShardSize(), gens), suggestion.maxErrors(), - suggestion.getShardSize(), indexReader,wordScorer , separator, suggestion.confidence(), suggestion.gramSize()); - UnicodeUtil.UTF8toUTF16(suggestion.getText(), spare); - Suggestion.Entry