From 876b5a3dcd295f41d9ddf3f70b336d331ee82f20 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 5 Mar 2013 10:46:25 +0100 Subject: [PATCH] prefer totalTermFrequency over docFreq in PhraseSuggester --- .../suggest/phrase/CandidateGenerator.java | 4 +- .../phrase/DirectCandidateGenerator.java | 62 ++++++++++++------- .../search/suggest/phrase/LaplaceScorer.java | 11 +--- .../phrase/LinearInterpoatingScorer.java | 4 +- .../MultiCandidateGeneratorWrapper.java | 4 +- .../phrase/NoisyChannelSpellChecker.java | 2 +- .../suggest/phrase/StupidBackoffScorer.java | 8 +-- .../search/suggest/phrase/WordScorer.java | 17 ++--- .../phrase/NoisyChannelSpellCheckerTests.java | 38 +++++------- 9 files changed, 80 insertions(+), 70 deletions(-) diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java b/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java index 190a9f65bcd..bd361215145 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java @@ -29,7 +29,7 @@ public abstract class CandidateGenerator { public abstract boolean isKnownWord(BytesRef term) throws IOException; - public abstract int frequency(BytesRef term) throws IOException; + public abstract long frequency(BytesRef term) throws IOException; public CandidateSet drawCandidates(BytesRef term, int numCandidates) throws IOException { CandidateSet set = new CandidateSet(Candidate.EMPTY, createCandidate(term)); @@ -39,7 +39,7 @@ public abstract class CandidateGenerator { public Candidate createCandidate(BytesRef term) throws IOException { return createCandidate(term, frequency(term), 1.0); } - public abstract Candidate createCandidate(BytesRef term, int frequency, double channelScore) throws IOException; + public abstract Candidate createCandidate(BytesRef term, long frequency, double channelScore) throws IOException; public abstract CandidateSet drawCandidates(CandidateSet set, int numCandidates) throws IOException; diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java b/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java index 1ac21357775..0cddfdbe08f 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java @@ -29,6 +29,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.spell.DirectSpellChecker; import org.apache.lucene.search.spell.SuggestMode; import org.apache.lucene.search.spell.SuggestWord; @@ -43,13 +44,17 @@ public final class DirectCandidateGenerator extends CandidateGenerator { private final DirectSpellChecker spellchecker; private final String field; private final SuggestMode suggestMode; + private final TermsEnum termsEnum; private final IndexReader reader; - private final int docCount; + private final long dictSize; private final double logBase = 5; - private final int frequencyPlateau; + private final long frequencyPlateau; private final Analyzer preFilter; private final Analyzer postFilter; private final double nonErrorLikelihood; + private final boolean useTotalTermFrequency; + private final CharsRef spare = new CharsRef(); + private final BytesRef byteSpare = new BytesRef(); public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader, double nonErrorLikelihood) throws IOException { this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, null, null); @@ -65,13 +70,15 @@ public final class DirectCandidateGenerator extends CandidateGenerator { if (terms == null) { throw new ElasticSearchIllegalArgumentException("generator field [" + field + "] doesn't exist"); } - final int docCount = terms.getDocCount(); - this.docCount = docCount == -1 ? reader.maxDoc() : docCount; + final long dictSize = terms.getSumTotalTermFreq(); + this.useTotalTermFrequency = dictSize != -1; + this.dictSize = dictSize == -1 ? reader.maxDoc() : dictSize; this.preFilter = preFilter; this.postFilter = postFilter; this.nonErrorLikelihood = nonErrorLikelihood; float thresholdFrequency = spellchecker.getThresholdFrequency(); - this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int)(docCount * thresholdFrequency); + this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int)(dictSize * thresholdFrequency); + termsEnum = terms.iterator(null); } /* (non-Javadoc) @@ -86,8 +93,17 @@ public final class DirectCandidateGenerator extends CandidateGenerator { * @see org.elasticsearch.search.suggest.phrase.CandidateGenerator#frequency(org.apache.lucene.util.BytesRef) */ @Override - public int frequency(BytesRef term) throws IOException { - return reader.docFreq(new Term(field, term)); + public long frequency(BytesRef term) throws IOException { + term = preFilter(term, spare, byteSpare); + return internalFrequency(term); + } + + + public long internalFrequency(BytesRef term) throws IOException { + if (termsEnum.seekExact(term, true)) { + return useTotalTermFrequency ? termsEnum.totalTermFreq() : termsEnum.docFreq(); + } + return 0; } public String getField() { @@ -99,18 +115,16 @@ public final class DirectCandidateGenerator extends CandidateGenerator { */ @Override public CandidateSet drawCandidates(CandidateSet set, int numCandidates) throws IOException { - CharsRef spare = new CharsRef(); - BytesRef byteSpare = new BytesRef(); Candidate original = set.originalTerm; BytesRef term = preFilter(original.term, spare, byteSpare); - final int frequency = original.frequency; - spellchecker.setThresholdFrequency(thresholdFrequency(frequency, docCount)); + final long frequency = original.frequency; + spellchecker.setThresholdFrequency(thresholdFrequency(frequency, dictSize)); SuggestWord[] suggestSimilar = spellchecker.suggestSimilar(new Term(field, term), numCandidates, reader, this.suggestMode); List candidates = new ArrayList(suggestSimilar.length); for (int i = 0; i < suggestSimilar.length; i++) { SuggestWord suggestWord = suggestSimilar[i]; BytesRef candidate = new BytesRef(suggestWord.string); - postFilter(new Candidate(candidate, suggestWord.freq, suggestWord.score, score(suggestWord.freq, suggestWord.score, docCount)), spare, byteSpare, candidates); + postFilter(new Candidate(candidate, internalFrequency(candidate), suggestWord.score, score(suggestWord.freq, suggestWord.score, dictSize)), spare, byteSpare, candidates); } set.addCandidates(candidates); return set; @@ -140,24 +154,26 @@ public final class DirectCandidateGenerator extends CandidateGenerator { @Override public void nextToken() throws IOException { this.fillBytesRef(result); + if (posIncAttr.getPositionIncrement() > 0 && result.bytesEquals(candidate.term)) { - candidates.add(new Candidate(BytesRef.deepCopyOf(result), candidate.frequency, candidate.stringDistance, score(candidate.frequency, candidate.stringDistance, docCount))); + BytesRef term = BytesRef.deepCopyOf(result); + long freq = frequency(term); + candidates.add(new Candidate(BytesRef.deepCopyOf(term), freq, candidate.stringDistance, score(candidate.frequency, candidate.stringDistance, dictSize))); } else { - int freq = frequency(result); - candidates.add(new Candidate(BytesRef.deepCopyOf(result), freq, nonErrorLikelihood, score(candidate.frequency, candidate.stringDistance, docCount))); + candidates.add(new Candidate(BytesRef.deepCopyOf(result), candidate.frequency, nonErrorLikelihood, score(candidate.frequency, candidate.stringDistance, dictSize))); } } }, spare); } } - private double score(int frequency, double errorScore, int docCount) { - return errorScore * (((double)frequency + 1) / ((double)docCount +1)); + private double score(long frequency, double errorScore, long dictionarySize) { + return errorScore * (((double)frequency + 1) / ((double)dictionarySize +1)); } - protected int thresholdFrequency(int termFrequency, int docCount) { + protected long thresholdFrequency(long termFrequency, long dictionarySize) { if (termFrequency > 0) { - return (int) Math.round(termFrequency * (Math.log10(termFrequency - frequencyPlateau) * (1.0 / Math.log10(logBase))) + 1); + return (long) Math.round(termFrequency * (Math.log10(termFrequency - frequencyPlateau) * (1.0 / Math.log10(logBase))) + 1); } return 0; @@ -193,10 +209,10 @@ public final class DirectCandidateGenerator extends CandidateGenerator { public static final Candidate[] EMPTY = new Candidate[0]; public final BytesRef term; public final double stringDistance; - public final int frequency; + public final long frequency; public final double score; - public Candidate(BytesRef term, int frequency, double stringDistance, double score) { + public Candidate(BytesRef term, long frequency, double stringDistance, double score) { this.frequency = frequency; this.term = term; this.stringDistance = stringDistance; @@ -235,8 +251,8 @@ public final class DirectCandidateGenerator extends CandidateGenerator { } @Override - public Candidate createCandidate(BytesRef term, int frequency, double channelScore) throws IOException { - return new Candidate(term, frequency, channelScore, score(frequency, channelScore, docCount)); + public Candidate createCandidate(BytesRef term, long frequency, double channelScore) throws IOException { + return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize)); } } diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java b/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java index a9cd3d8b17d..d2973525678 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java @@ -42,23 +42,18 @@ public final class LaplaceScorer extends WordScorer { this.alpha = alpha; } - public double score(Candidate word, Candidate previousWord) throws IOException{ - SuggestUtils.join(separator, spare, previousWord.term, word.term); - return (alpha + frequency(spare)) / (alpha + previousWord.frequency); - } - @Override protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { SuggestUtils.join(separator, spare, w_1.term, word.term); - return (alpha + frequency(spare)) / (alpha + w_1.frequency); + return (alpha + frequency(spare)) / (alpha + w_1.frequency + vocabluarySize); } @Override protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2) throws IOException { SuggestUtils.join(separator, spare, w_2.term, w_1.term, word.term); - int trigramCount = frequency(spare); + long trigramCount = frequency(spare); SuggestUtils.join(separator, spare, w_1.term, word.term); - return (alpha + trigramCount) / (alpha + frequency(spare)); + return (alpha + trigramCount) / (alpha + frequency(spare) + vocabluarySize); } diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpoatingScorer.java b/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpoatingScorer.java index 452b765fbc3..0fda2d5b23a 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpoatingScorer.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpoatingScorer.java @@ -44,7 +44,7 @@ public final class LinearInterpoatingScorer extends WordScorer { @Override protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { SuggestUtils.join(separator, spare, w_1.term, word.term); - final int count = frequency(spare); + final long count = frequency(spare); if (count < 1) { return unigramLambda * scoreUnigram(word); } @@ -54,7 +54,7 @@ public final class LinearInterpoatingScorer extends WordScorer { @Override protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException { SuggestUtils.join(separator, spare, w.term, w_1.term, w_2.term); - final int count = frequency(spare); + final long count = frequency(spare); if (count < 1) { return scoreBigram(w, w_1); } diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java b/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java index 414dcb92ca6..f830cd3a81c 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java @@ -40,7 +40,7 @@ public final class MultiCandidateGeneratorWrapper extends CandidateGenerator { } @Override - public int frequency(BytesRef term) throws IOException { + public long frequency(BytesRef term) throws IOException { return candidateGenerator[0].frequency(term); } @@ -70,7 +70,7 @@ public final class MultiCandidateGeneratorWrapper extends CandidateGenerator { return set; } @Override - public Candidate createCandidate(BytesRef term, int frequency, double channelScore) throws IOException { + public Candidate createCandidate(BytesRef term, long frequency, double channelScore) throws IOException { return candidateGenerator[0].createCandidate(term, frequency, channelScore); } diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java b/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java index 70a7ac8527d..1527d69f27c 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java @@ -81,7 +81,7 @@ public final class NoisyChannelSpellChecker { anyUnigram = true; if (posIncAttr.getPositionIncrement() == 0 && typeAttribute.type() == SynonymFilter.TYPE_SYNONYM) { assert currentSet != null; - int freq = 0; + long freq = 0; if ((freq = generator.frequency(term)) > 0) { currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood)); } diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java b/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java index 4754334bfa8..86f4acd649f 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java @@ -44,7 +44,7 @@ public class StupidBackoffScorer extends WordScorer { @Override protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { SuggestUtils.join(separator, spare, w_1.term, word.term); - final int count = frequency(spare); + final long count = frequency(spare); if (count < 1) { return discount * scoreUnigram(word); } @@ -54,17 +54,17 @@ public class StupidBackoffScorer extends WordScorer { @Override protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException { SuggestUtils.join(separator, spare, w_2.term, w_1.term, w.term); - final int trigramCount = frequency(spare); + final long trigramCount = frequency(spare); if (trigramCount < 1) { SuggestUtils.join(separator, spare, w_1.term, w.term); - final int count = frequency(spare); + final long count = frequency(spare); if (count < 1) { return discount * scoreUnigram(w); } return discount * (count / (w_1.frequency + 0.00000000001d)); } SuggestUtils.join(separator, spare, w_1.term, w.term); - final int bigramCount = frequency(spare); + final long bigramCount = frequency(spare); return trigramCount / (bigramCount + 0.00000000001d); } diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java b/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java index 79640a80bbd..b0f9e128ac3 100644 --- a/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java +++ b/src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java @@ -25,7 +25,6 @@ import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.ElasticSearchException; import org.elasticsearch.ElasticSearchIllegalArgumentException; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate; import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet; @@ -35,11 +34,13 @@ public abstract class WordScorer { protected final IndexReader reader; protected final String field; protected final Terms terms; - protected final int totalDocuments; + protected final long vocabluarySize; protected double realWordLikelyhood; protected final BytesRef spare = new BytesRef(); protected final BytesRef separator; protected final TermsEnum termsEnum; + private final long numTerms; + private final boolean useTotalTermFreq; public WordScorer(IndexReader reader, String field, double realWordLikelyHood, BytesRef separator) throws IOException { this.field = field; @@ -47,17 +48,19 @@ public abstract class WordScorer { if (terms == null) { throw new ElasticSearchIllegalArgumentException("Field: [" + field + "] does not exist"); } - final int docCount = terms.getDocCount(); - this.totalDocuments = docCount == -1 ? reader.maxDoc() : docCount; + final long vocSize = terms.getSumTotalTermFreq(); + this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize; + this.useTotalTermFreq = vocSize != -1; + this.numTerms = terms.size(); this.termsEnum = terms.iterator(null); this.reader = reader; this.realWordLikelyhood = realWordLikelyHood; this.separator = separator; } - public int frequency(BytesRef term) throws IOException { + public long frequency(BytesRef term) throws IOException { if (termsEnum.seekExact(term, true)) { - return termsEnum.docFreq(); + return useTotalTermFreq ? termsEnum.totalTermFreq() : termsEnum.docFreq(); } return 0; } @@ -80,7 +83,7 @@ public abstract class WordScorer { } protected double scoreUnigram(Candidate word) throws IOException { - return (1.0 + word.frequency) / (1.0 + totalDocuments); + return (1.0 + frequency(word.term)) / (vocabluarySize + numTerms); } protected double scoreBigram(Candidate word, Candidate w_1) throws IOException { diff --git a/src/test/java/org/elasticsearch/test/unit/search/suggest/phrase/NoisyChannelSpellCheckerTests.java b/src/test/java/org/elasticsearch/test/unit/search/suggest/phrase/NoisyChannelSpellCheckerTests.java index d1c1653a81f..1c9a4f996fc 100644 --- a/src/test/java/org/elasticsearch/test/unit/search/suggest/phrase/NoisyChannelSpellCheckerTests.java +++ b/src/test/java/org/elasticsearch/test/unit/search/suggest/phrase/NoisyChannelSpellCheckerTests.java @@ -123,15 +123,15 @@ public class NoisyChannelSpellCheckerTests { assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); - assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); - assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); + assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); + assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 5, 0.5f, 4, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); - assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); - assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); + assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); + assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); // test synonyms @@ -219,11 +219,11 @@ public class NoisyChannelSpellCheckerTests { NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); - DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95); - DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, wrapper, wrapper); + DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95); + DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, wrapper, wrapper); CandidateGenerator generator = new MultiCandidateGeneratorWrapper(forward, reverse); - Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 5, 1, 1, ir, "body", wordScorer, 1, 2); + Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 10, 1, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); @@ -241,9 +241,9 @@ public class NoisyChannelSpellCheckerTests { corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 5, 0.5f, 4, ir, "body", wordScorer, 0, 2); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); - assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); - assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel")); - assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel")); + assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel")); + assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel")); + assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("tarr the god jewel")); @@ -316,9 +316,9 @@ public class NoisyChannelSpellCheckerTests { corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 5, 0.5f, 4, ir, "body", wordScorer, 0, 3); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); - assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); - assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); - assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the gog jewel")); + assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); + assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); + assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); @@ -326,9 +326,9 @@ public class NoisyChannelSpellCheckerTests { corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 5, 0.5f, 4, ir, "body", wordScorer, 1, 3); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); - assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); - assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); - assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the gog jewel")); + assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); + assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); + assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 5, 0.5f, 1, ir, "body", wordScorer, 100, 3); @@ -362,20 +362,16 @@ public class NoisyChannelSpellCheckerTests { wordScorer = new LinearInterpoatingScorer(ir, "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 10, 2, 4, ir, "body", wordScorer, 1, 3); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); - assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("captain american")); - assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("captain ursa")); generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, null, analyzer); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 10, 2, 4, ir, "body", wordScorer, 1, 3); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); - assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("captain american")); - assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("captain usw")); wordScorer = new StupidBackoffScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.4); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 5, 0.5f, 2, ir, "body", wordScorer, 0, 3); assertThat(corrections.length, equalTo(2)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); - assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); + assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); } }