From d6e16b6e7446e2340142ab9b5e09adde4bf737fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Wed, 10 Aug 2016 12:52:26 +0200 Subject: [PATCH 1/5] Move getDirectSpellChecker to DirectSpellcheckerSettings --- .../suggest/DirectSpellcheckerSettings.java | 35 +++++++++++++++++++ .../suggest/phrase/PhraseSuggester.java | 3 +- .../search/suggest/term/TermSuggester.java | 2 +- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java b/core/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java index 81c73df53fa..4571a4f20ed 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java @@ -21,8 +21,13 @@ package org.elasticsearch.search.suggest; import org.apache.lucene.search.spell.DirectSpellChecker; import org.apache.lucene.search.spell.StringDistance; import org.apache.lucene.search.spell.SuggestMode; +import org.apache.lucene.search.spell.SuggestWord; +import org.apache.lucene.search.spell.SuggestWordFrequencyComparator; +import org.apache.lucene.search.spell.SuggestWordQueue; import org.apache.lucene.util.automaton.LevenshteinAutomata; +import java.util.Comparator; + public class DirectSpellcheckerSettings { // NB: If this changes, make sure to change the default in TermBuilderSuggester @@ -49,6 +54,9 @@ public class DirectSpellcheckerSettings { private int minWordLength = DEFAULT_MIN_WORD_LENGTH; private float minDocFreq = DEFAULT_MIN_DOC_FREQ; + private static final Comparator LUCENE_FREQUENCY = new SuggestWordFrequencyComparator(); + private static final Comparator SCORE_COMPARATOR = SuggestWordQueue.DEFAULT_COMPARATOR; + public SuggestMode suggestMode() { return suggestMode; } @@ -129,6 +137,33 @@ public class DirectSpellcheckerSettings { this.minDocFreq = minDocFreq; } + public DirectSpellChecker getDirectSpellChecker() { + + DirectSpellChecker directSpellChecker = new DirectSpellChecker(); + directSpellChecker.setAccuracy(accuracy()); + Comparator comparator; + switch (sort()) { + case SCORE: + comparator = SCORE_COMPARATOR; + break; + case FREQUENCY: + comparator = LUCENE_FREQUENCY; + break; + default: + throw new IllegalArgumentException("Illegal suggest sort: " + sort()); + } + directSpellChecker.setComparator(comparator); + directSpellChecker.setDistance(stringDistance()); + directSpellChecker.setMaxEdits(maxEdits()); + directSpellChecker.setMaxInspections(maxInspections()); + directSpellChecker.setMaxQueryFrequency(maxTermFreq()); + directSpellChecker.setMinPrefix(prefixLength()); + directSpellChecker.setMinQueryLength(minWordLength()); + directSpellChecker.setThresholdFrequency(minDocFreq()); + directSpellChecker.setLowerCaseTerms(false); + return directSpellChecker; + } + @Override public String toString() { return "[" + diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java index 25f589794f3..d1324f7b3df 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java @@ -45,7 +45,6 @@ import org.elasticsearch.script.ScriptService; import org.elasticsearch.search.suggest.Suggest.Suggestion; import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry; import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option; -import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.Suggester; import org.elasticsearch.search.suggest.SuggestionBuilder; import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext; @@ -84,7 +83,7 @@ public final class PhraseSuggester extends Suggester { final List gens = new ArrayList<>(generators.size()); for (int i = 0; i < numGenerators; i++) { PhraseSuggestionContext.DirectCandidateGenerator generator = generators.get(i); - DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(generator); + DirectSpellChecker directSpellChecker = generator.getDirectSpellChecker(); Terms terms = MultiFields.getTerms(indexReader, generator.field()); if (terms != null) { gens.add(new DirectCandidateGenerator(directSpellChecker, generator.field(), generator.suggestMode(), diff --git a/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java b/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java index a06baccb999..ed71c997d45 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java @@ -48,7 +48,7 @@ public final class TermSuggester extends Suggester { @Override public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare) throws IOException { - DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(suggestion.getDirectSpellCheckerSettings()); + DirectSpellChecker directSpellChecker = suggestion.getDirectSpellCheckerSettings().getDirectSpellChecker(); final IndexReader indexReader = searcher.getIndexReader(); TermSuggestion response = new TermSuggestion( name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort() From cdc77648a1a3b13bb30696cdd640caf21a6ad034 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Wed, 10 Aug 2016 13:00:30 +0200 Subject: [PATCH 2/5] Move analysis helper methods to DirectCandidateGenerator --- .../search/suggest/SuggestUtils.java | 43 ----------------- .../phrase/DirectCandidateGenerator.java | 46 +++++++++++++++++-- .../phrase/NoisyChannelSpellChecker.java | 20 ++++---- .../search/suggest/term/TermSuggester.java | 3 +- 4 files changed, 55 insertions(+), 57 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java b/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java index f3a034cda61..744552e7ea4 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java @@ -18,7 +18,6 @@ */ package org.elasticsearch.search.suggest; -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; @@ -29,11 +28,7 @@ import org.apache.lucene.search.spell.SuggestWordFrequencyComparator; import org.apache.lucene.search.spell.SuggestWordQueue; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.CharsRefBuilder; -import org.apache.lucene.util.IOUtils; import org.elasticsearch.common.ParseField; -import org.elasticsearch.common.io.FastCharArrayReader; import java.io.IOException; import java.util.Comparator; @@ -103,44 +98,6 @@ public final class SuggestUtils { public void end() {} } - public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) throws IOException { - spare.copyUTF8Bytes(toAnalyze); - return analyze(analyzer, spare.get(), field, consumer); - } - - public static int analyze(Analyzer analyzer, CharsRef toAnalyze, String field, TokenConsumer consumer) throws IOException { - try (TokenStream ts = analyzer.tokenStream( - field, new FastCharArrayReader(toAnalyze.chars, toAnalyze.offset, toAnalyze.length))) { - return analyze(ts, consumer); - } - } - - /** NOTE: this method closes the TokenStream, even on exception, which is awkward - * because really the caller who called {@link Analyzer#tokenStream} should close it, - * but when trying that there are recursion issues when we try to use the same - * TokenStream twice in the same recursion... */ - public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException { - int numTokens = 0; - boolean success = false; - try { - stream.reset(); - consumer.reset(stream); - while (stream.incrementToken()) { - consumer.nextToken(); - numTokens++; - } - consumer.end(); - success = true; - } finally { - if (success) { - stream.close(); - } else { - IOUtils.closeWhileHandlingException(stream); - } - } - return numTokens; - } - public static class Fields { public static final ParseField STRING_DISTANCE = new ParseField("string_distance"); public static final ParseField SUGGEST_MODE = new ParseField("suggest_mode"); diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java index 67fed51b622..82ae3a02d33 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java @@ -19,6 +19,7 @@ package org.elasticsearch.search.suggest.phrase; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; @@ -29,8 +30,12 @@ import org.apache.lucene.search.spell.SuggestMode; import org.apache.lucene.search.spell.SuggestWord; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.IOUtils; +import org.elasticsearch.common.io.FastCharArrayReader; import org.elasticsearch.search.suggest.SuggestUtils; +import org.elasticsearch.search.suggest.SuggestUtils.TokenConsumer; import java.io.IOException; import java.util.ArrayList; @@ -44,7 +49,7 @@ import static java.lang.Math.log10; import static java.lang.Math.max; import static java.lang.Math.round; -final class DirectCandidateGenerator extends CandidateGenerator { +public final class DirectCandidateGenerator extends CandidateGenerator { private final DirectSpellChecker spellchecker; private final String field; @@ -140,7 +145,7 @@ final class DirectCandidateGenerator extends CandidateGenerator { return term; } final BytesRefBuilder result = byteSpare; - SuggestUtils.analyze(preFilter, term, field, new SuggestUtils.TokenConsumer() { + analyze(preFilter, term, field, new SuggestUtils.TokenConsumer() { @Override public void nextToken() throws IOException { @@ -156,7 +161,7 @@ final class DirectCandidateGenerator extends CandidateGenerator { candidates.add(candidate); } else { final BytesRefBuilder result = byteSpare; - SuggestUtils.analyze(postFilter, candidate.term, field, new SuggestUtils.TokenConsumer() { + analyze(postFilter, candidate.term, field, new SuggestUtils.TokenConsumer() { @Override public void nextToken() throws IOException { this.fillBytesRef(result); @@ -283,4 +288,39 @@ final class DirectCandidateGenerator extends CandidateGenerator { return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize), userInput); } + public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) throws IOException { + spare.copyUTF8Bytes(toAnalyze); + CharsRef charsRef = spare.get(); + try (TokenStream ts = analyzer.tokenStream( + field, new FastCharArrayReader(charsRef.chars, charsRef.offset, charsRef.length))) { + return analyze(ts, consumer); + } + } + + /** NOTE: this method closes the TokenStream, even on exception, which is awkward + * because really the caller who called {@link Analyzer#tokenStream} should close it, + * but when trying that there are recursion issues when we try to use the same + * TokenStream twice in the same recursion... */ + public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException { + int numTokens = 0; + boolean success = false; + try { + stream.reset(); + consumer.reset(stream); + while (stream.incrementToken()) { + consumer.nextToken(); + numTokens++; + } + consumer.end(); + success = true; + } finally { + if (success) { + stream.close(); + } else { + IOUtils.closeWhileHandlingException(stream); + } + } + return numTokens; + } + } diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java index ec9ca6e1da2..223b169be6b 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java @@ -51,19 +51,19 @@ public final class NoisyChannelSpellChecker { public NoisyChannelSpellChecker(double nonErrorLikelihood) { this(nonErrorLikelihood, true, DEFAULT_TOKEN_LIMIT); } - + public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) { this.realWordLikelihood = nonErrorLikelihood; this.requireUnigram = requireUnigram; this.tokenLimit = tokenLimit; - + } public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors, int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException { - + final List candidateSetsList = new ArrayList<>(); - SuggestUtils.analyze(stream, new SuggestUtils.TokenConsumer() { + DirectCandidateGenerator.analyze(stream, new SuggestUtils.TokenConsumer() { CandidateSet currentSet = null; private TypeAttribute typeAttribute; private final BytesRefBuilder termsRef = new BytesRefBuilder(); @@ -74,7 +74,7 @@ public final class NoisyChannelSpellChecker { super.reset(stream); typeAttribute = stream.addAttribute(TypeAttribute.class); } - + @Override public void nextToken() throws IOException { anyTokens = true; @@ -96,7 +96,7 @@ public final class NoisyChannelSpellChecker { currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true)); } } - + @Override public void end() { if (currentSet != null) { @@ -107,11 +107,11 @@ public final class NoisyChannelSpellChecker { } } }); - + if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) { return Result.EMPTY; } - + for (CandidateSet candidateSet : candidateSetsList) { generator.drawCandidates(candidateSet); } @@ -127,13 +127,13 @@ public final class NoisyChannelSpellChecker { cutoffScore = inputPhraseScore * confidence; } Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore); - + return new Result(bestCandidates, cutoffScore); } public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator, float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException { - + return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, numCorrections, scorer, confidence, gramSize); } diff --git a/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java b/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java index ed71c997d45..6cca58286c5 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java @@ -34,6 +34,7 @@ import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.Suggester; import org.elasticsearch.search.suggest.SuggestionBuilder; import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext; +import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator; import java.io.IOException; import java.util.ArrayList; @@ -73,7 +74,7 @@ public final class TermSuggester extends Suggester { private List queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException { final List result = new ArrayList<>(); final String field = suggestion.getField(); - SuggestUtils.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() { + DirectCandidateGenerator.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() { @Override public void nextToken() { Term term = new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder()))); From e6d57af0c59eddba74aef15f971d5ef51e671d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Wed, 10 Aug 2016 13:06:43 +0200 Subject: [PATCH 3/5] Moving join() helper function to WordScorer --- .../search/suggest/SuggestUtils.java | 43 ------------------- .../search/suggest/phrase/Correction.java | 3 +- .../DirectCandidateGeneratorBuilder.java | 3 +- .../search/suggest/phrase/LaplaceScorer.java | 7 ++- .../phrase/LinearInterpolatingScorer.java | 7 ++- .../suggest/phrase/StupidBackoffScorer.java | 7 ++- .../search/suggest/phrase/WordScorer.java | 10 +++++ 7 files changed, 22 insertions(+), 58 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java b/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java index 744552e7ea4..dff02f7e887 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java @@ -22,61 +22,18 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.search.spell.DirectSpellChecker; -import org.apache.lucene.search.spell.SuggestWord; -import org.apache.lucene.search.spell.SuggestWordFrequencyComparator; -import org.apache.lucene.search.spell.SuggestWordQueue; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.elasticsearch.common.ParseField; import java.io.IOException; -import java.util.Comparator; public final class SuggestUtils { - private static final Comparator LUCENE_FREQUENCY = new SuggestWordFrequencyComparator(); - private static final Comparator SCORE_COMPARATOR = SuggestWordQueue.DEFAULT_COMPARATOR; private SuggestUtils() { // utils!! } - public static DirectSpellChecker getDirectSpellChecker(DirectSpellcheckerSettings suggestion) { - DirectSpellChecker directSpellChecker = new DirectSpellChecker(); - directSpellChecker.setAccuracy(suggestion.accuracy()); - Comparator comparator; - switch (suggestion.sort()) { - case SCORE: - comparator = SCORE_COMPARATOR; - break; - case FREQUENCY: - comparator = LUCENE_FREQUENCY; - break; - default: - throw new IllegalArgumentException("Illegal suggest sort: " + suggestion.sort()); - } - directSpellChecker.setComparator(comparator); - directSpellChecker.setDistance(suggestion.stringDistance()); - directSpellChecker.setMaxEdits(suggestion.maxEdits()); - directSpellChecker.setMaxInspections(suggestion.maxInspections()); - directSpellChecker.setMaxQueryFrequency(suggestion.maxTermFreq()); - directSpellChecker.setMinPrefix(suggestion.prefixLength()); - directSpellChecker.setMinQueryLength(suggestion.minWordLength()); - directSpellChecker.setThresholdFrequency(suggestion.minDocFreq()); - directSpellChecker.setLowerCaseTerms(false); - return directSpellChecker; - } - - public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) { - result.clear(); - for (int i = 0; i < toJoin.length - 1; i++) { - result.append(toJoin[i]); - result.append(separator); - } - result.append(toJoin[toJoin.length-1]); - return result.get(); - } - public abstract static class TokenConsumer { protected CharTermAttribute charTermAttr; protected PositionIncrementAttribute posIncAttr; diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/Correction.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/Correction.java index ffd21469f79..23db2b0fcbb 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/Correction.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/Correction.java @@ -20,7 +20,6 @@ package org.elasticsearch.search.suggest.phrase; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; -import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import java.util.Arrays; @@ -73,7 +72,7 @@ public final class Correction implements Comparable { len += toJoin[i].length; } result.grow(len); - return SuggestUtils.join(separator, result, toJoin); + return WordScorer.join(separator, result, toJoin); } /** Lower scores sorts first; if scores are equal, diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorBuilder.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorBuilder.java index 67b31043349..7c2616e0a49 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGeneratorBuilder.java @@ -449,7 +449,8 @@ public final class DirectCandidateGeneratorBuilder implements CandidateGenerator return new LuceneLevenshteinDistance(); } else if ("levenstein".equals(distanceVal)) { return new LevensteinDistance(); - //TODO Jaro and Winkler are 2 people - so apply same naming logic as damerau_levenshtein + // TODO Jaro and Winkler are 2 people - so apply same naming logic + // as damerau_levenshtein } else if ("jarowinkler".equals(distanceVal)) { return new JaroWinklerDistance(); } else if ("ngram".equals(distanceVal)) { diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java index 6b6301b49a4..562da448466 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java @@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Terms; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import java.io.IOException; @@ -41,15 +40,15 @@ final class LaplaceScorer extends WordScorer { @Override protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { - SuggestUtils.join(separator, spare, w_1.term, word.term); + join(separator, spare, w_1.term, word.term); return (alpha + frequency(spare.get())) / (alpha + w_1.frequency + vocabluarySize); } @Override protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2) throws IOException { - SuggestUtils.join(separator, spare, w_2.term, w_1.term, word.term); + join(separator, spare, w_2.term, w_1.term, word.term); long trigramCount = frequency(spare.get()); - SuggestUtils.join(separator, spare, w_1.term, word.term); + join(separator, spare, w_1.term, word.term); return (alpha + trigramCount) / (alpha + frequency(spare.get()) + vocabluarySize); } diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java index 2a5895a8320..c6d67fe8cf7 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpolatingScorer.java @@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Terms; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import java.io.IOException; @@ -56,7 +55,7 @@ public final class LinearInterpolatingScorer extends WordScorer { @Override protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { - SuggestUtils.join(separator, spare, w_1.term, word.term); + join(separator, spare, w_1.term, word.term); final long count = frequency(spare.get()); if (count < 1) { return unigramLambda * scoreUnigram(word); @@ -66,12 +65,12 @@ public final class LinearInterpolatingScorer extends WordScorer { @Override protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException { - SuggestUtils.join(separator, spare, w.term, w_1.term, w_2.term); + join(separator, spare, w.term, w_1.term, w_2.term); final long count = frequency(spare.get()); if (count < 1) { return scoreBigram(w, w_1); } - SuggestUtils.join(separator, spare, w.term, w_1.term); + join(separator, spare, w.term, w_1.term); return trigramLambda * (count / (1.d + frequency(spare.get()))) + scoreBigram(w, w_1); } diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java index ed0573bf006..8eb08ef0688 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java @@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Terms; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import java.io.IOException; @@ -41,7 +40,7 @@ class StupidBackoffScorer extends WordScorer { @Override protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { - SuggestUtils.join(separator, spare, w_1.term, word.term); + join(separator, spare, w_1.term, word.term); final long count = frequency(spare.get()); if (count < 1) { return discount * scoreUnigram(word); @@ -53,12 +52,12 @@ class StupidBackoffScorer extends WordScorer { protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException { // First see if there are bigrams. If there aren't then skip looking up the trigram. This saves lookups // when the bigrams and trigrams are rare and we need both anyway. - SuggestUtils.join(separator, spare, w_1.term, w.term); + join(separator, spare, w_1.term, w.term); long bigramCount = frequency(spare.get()); if (bigramCount < 1) { return discount * scoreUnigram(w); } - SuggestUtils.join(separator, spare, w_2.term, w_1.term, w.term); + join(separator, spare, w_2.term, w_1.term, w.term); long trigramCount = frequency(spare.get()); if (trigramCount < 1) { return discount * (bigramCount / (w_1.frequency + 0.00000000001d)); diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java index 69e62c1a175..32d4feb4b27 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java @@ -100,6 +100,16 @@ public abstract class WordScorer { return scoreBigram(word, w_1); } + public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) { + result.clear(); + for (int i = 0; i < toJoin.length - 1; i++) { + result.append(toJoin[i]); + result.append(separator); + } + result.append(toJoin[toJoin.length-1]); + return result.get(); + } + public interface WordScorerFactory { WordScorer newScorer(IndexReader reader, Terms terms, String field, double realWordLikelyhood, BytesRef separator) throws IOException; From 9c91ced02929f9702c0276a7ccb0053aeb811546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Wed, 10 Aug 2016 13:27:37 +0200 Subject: [PATCH 4/5] Removing use of ParseFields where we have alternative in other classes already --- .../search/suggest/SuggestUtils.java | 76 ------------------- .../CompletionSuggestionBuilder.java | 12 ++- .../phrase/DirectCandidateGenerator.java | 33 ++++++-- .../DirectCandidateGeneratorBuilder.java | 30 ++++---- .../phrase/NoisyChannelSpellChecker.java | 3 +- .../phrase/PhraseSuggestionBuilder.java | 3 +- .../search/suggest/term/TermSuggester.java | 6 +- .../suggest/term/TermSuggestionBuilder.java | 63 ++++++++------- 8 files changed, 84 insertions(+), 142 deletions(-) delete mode 100644 core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java diff --git a/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java b/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java deleted file mode 100644 index dff02f7e887..00000000000 --- a/core/src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.elasticsearch.search.suggest; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.elasticsearch.common.ParseField; - -import java.io.IOException; - -public final class SuggestUtils { - - private SuggestUtils() { - // utils!! - } - - public abstract static class TokenConsumer { - protected CharTermAttribute charTermAttr; - protected PositionIncrementAttribute posIncAttr; - protected OffsetAttribute offsetAttr; - - public void reset(TokenStream stream) { - charTermAttr = stream.addAttribute(CharTermAttribute.class); - posIncAttr = stream.addAttribute(PositionIncrementAttribute.class); - offsetAttr = stream.addAttribute(OffsetAttribute.class); - } - - protected BytesRef fillBytesRef(BytesRefBuilder spare) { - spare.copyChars(charTermAttr); - return spare.get(); - } - - public abstract void nextToken() throws IOException; - - public void end() {} - } - - public static class Fields { - public static final ParseField STRING_DISTANCE = new ParseField("string_distance"); - public static final ParseField SUGGEST_MODE = new ParseField("suggest_mode"); - public static final ParseField MAX_EDITS = new ParseField("max_edits"); - public static final ParseField MAX_INSPECTIONS = new ParseField("max_inspections"); - // TODO some of these constants are the same as MLT constants and - // could be moved to a shared class for consistency - public static final ParseField MAX_TERM_FREQ = new ParseField("max_term_freq"); - public static final ParseField PREFIX_LENGTH = new ParseField("prefix_length", "prefix_len"); - public static final ParseField MIN_WORD_LENGTH = new ParseField("min_word_length", "min_word_len"); - public static final ParseField MIN_DOC_FREQ = new ParseField("min_doc_freq"); - public static final ParseField SHARD_SIZE = new ParseField("shard_size"); - public static final ParseField ANALYZER = new ParseField("analyzer"); - public static final ParseField FIELD = new ParseField("field"); - public static final ParseField SIZE = new ParseField("size"); - public static final ParseField SORT = new ParseField("sort"); - public static final ParseField ACCURACY = new ParseField("accuracy"); - } -} diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java index 783b6536e2e..5e98d7f09ef 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java @@ -37,7 +37,6 @@ import org.elasticsearch.index.mapper.core.CompletionFieldMapper; import org.elasticsearch.index.mapper.core.CompletionFieldMapper2x; import org.elasticsearch.index.query.QueryParseContext; import org.elasticsearch.index.query.QueryShardContext; -import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.SuggestionBuilder; import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext; import org.elasticsearch.search.suggest.completion.context.ContextMapping; @@ -48,7 +47,6 @@ import org.elasticsearch.search.suggest.completion2x.context.GeolocationContextM import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -90,10 +88,10 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder completionSuggestionContext.regexOptions = RegexOptions.parse(parser, context), RegexOptions.REGEX_OPTIONS, ObjectParser.ValueType.OBJECT); - TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::field, SuggestUtils.Fields.FIELD); - TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::analyzer, SuggestUtils.Fields.ANALYZER); - TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::size, SuggestUtils.Fields.SIZE); - TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::shardSize, SuggestUtils.Fields.SHARD_SIZE); + TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::field, FIELDNAME_FIELD); + TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::analyzer, ANALYZER_FIELD); + TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::size, SIZE_FIELD); + TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::shardSize, SHARDSIZE_FIELD); TLP_PARSER.declareField((p, v, c) -> { // Copy the current structure. We will parse, once the mapping is provided XContentBuilder builder = XContentFactory.contentBuilder(XContentType.JSON); @@ -353,7 +351,7 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder candidateSetsList = new ArrayList<>(); - DirectCandidateGenerator.analyze(stream, new SuggestUtils.TokenConsumer() { + DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() { CandidateSet currentSet = null; private TypeAttribute typeAttribute; private final BytesRefBuilder termsRef = new BytesRefBuilder(); diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestionBuilder.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestionBuilder.java index 1e0e6680aa4..94ad7b8fad0 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestionBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestionBuilder.java @@ -43,7 +43,6 @@ import org.elasticsearch.script.CompiledScript; import org.elasticsearch.script.Script; import org.elasticsearch.script.ScriptContext; import org.elasticsearch.script.ScriptService; -import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.SuggestionBuilder; import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext; import org.elasticsearch.search.suggest.phrase.PhraseSuggestionContext.DirectCandidateGenerator; @@ -596,7 +595,7 @@ public class PhraseSuggestionBuilder extends SuggestionBuilder { return response; } - private List queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException { + private static List queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException { final List result = new ArrayList<>(); final String field = suggestion.getField(); - DirectCandidateGenerator.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() { + DirectCandidateGenerator.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, + new DirectCandidateGenerator.TokenConsumer() { @Override public void nextToken() { Term term = new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder()))); diff --git a/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggestionBuilder.java b/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggestionBuilder.java index d7ec7e6cae7..31e6c3718e8 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggestionBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggestionBuilder.java @@ -37,7 +37,6 @@ import org.elasticsearch.index.query.QueryParseContext; import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.search.suggest.DirectSpellcheckerSettings; import org.elasticsearch.search.suggest.SortBy; -import org.elasticsearch.search.suggest.SuggestUtils; import org.elasticsearch.search.suggest.SuggestionBuilder; import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext; @@ -52,16 +51,16 @@ import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAUL import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_DOC_FREQ; import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_WORD_LENGTH; import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_PREFIX_LENGTH; -import static org.elasticsearch.search.suggest.SuggestUtils.Fields.ACCURACY; -import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MAX_EDITS; -import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MAX_INSPECTIONS; -import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MAX_TERM_FREQ; -import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MIN_DOC_FREQ; -import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MIN_WORD_LENGTH; -import static org.elasticsearch.search.suggest.SuggestUtils.Fields.PREFIX_LENGTH; -import static org.elasticsearch.search.suggest.SuggestUtils.Fields.SORT; -import static org.elasticsearch.search.suggest.SuggestUtils.Fields.STRING_DISTANCE; -import static org.elasticsearch.search.suggest.SuggestUtils.Fields.SUGGEST_MODE; +import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.ACCURACY_FIELD; +import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_EDITS_FIELD; +import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_INSPECTIONS_FIELD; +import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_TERM_FREQ_FIELD; +import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_DOC_FREQ_FIELD; +import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_WORD_LENGTH_FIELD; +import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.PREFIX_LENGTH_FIELD; +import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SORT_FIELD; +import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.STRING_DISTANCE_FIELD; +import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SUGGESTMODE_FIELD; /** * Defines the actual suggest command. Each command uses the global options @@ -376,16 +375,16 @@ public class TermSuggestionBuilder extends SuggestionBuilder Date: Wed, 10 Aug 2016 18:03:39 +0200 Subject: [PATCH 5/5] Renaming method according to review comments --- .../search/suggest/DirectSpellcheckerSettings.java | 2 +- .../elasticsearch/search/suggest/phrase/PhraseSuggester.java | 2 +- .../org/elasticsearch/search/suggest/term/TermSuggester.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java b/core/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java index 4571a4f20ed..1250dfdac3b 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java @@ -137,7 +137,7 @@ public class DirectSpellcheckerSettings { this.minDocFreq = minDocFreq; } - public DirectSpellChecker getDirectSpellChecker() { + public DirectSpellChecker createDirectSpellChecker() { DirectSpellChecker directSpellChecker = new DirectSpellChecker(); directSpellChecker.setAccuracy(accuracy()); diff --git a/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java b/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java index d1324f7b3df..a9f5accb918 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java @@ -83,7 +83,7 @@ public final class PhraseSuggester extends Suggester { final List gens = new ArrayList<>(generators.size()); for (int i = 0; i < numGenerators; i++) { PhraseSuggestionContext.DirectCandidateGenerator generator = generators.get(i); - DirectSpellChecker directSpellChecker = generator.getDirectSpellChecker(); + DirectSpellChecker directSpellChecker = generator.createDirectSpellChecker(); Terms terms = MultiFields.getTerms(indexReader, generator.field()); if (terms != null) { gens.add(new DirectCandidateGenerator(directSpellChecker, generator.field(), generator.suggestMode(), diff --git a/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java b/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java index fc6113f16be..0d58e0f5cac 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java @@ -48,7 +48,7 @@ public final class TermSuggester extends Suggester { @Override public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare) throws IOException { - DirectSpellChecker directSpellChecker = suggestion.getDirectSpellCheckerSettings().getDirectSpellChecker(); + DirectSpellChecker directSpellChecker = suggestion.getDirectSpellCheckerSettings().createDirectSpellChecker(); final IndexReader indexReader = searcher.getIndexReader(); TermSuggestion response = new TermSuggestion( name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort()