Moving join() helper function to WordScorer

parent cdc77648a1
commit e6d57af0c5

@@ -22,61 +22,18 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.search.spell.DirectSpellChecker;
-import org.apache.lucene.search.spell.SuggestWord;
-import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
-import org.apache.lucene.search.spell.SuggestWordQueue;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.elasticsearch.common.ParseField;
 
 import java.io.IOException;
-import java.util.Comparator;
 
 public final class SuggestUtils {
-    private static final Comparator<SuggestWord> LUCENE_FREQUENCY = new SuggestWordFrequencyComparator();
-    private static final Comparator<SuggestWord> SCORE_COMPARATOR = SuggestWordQueue.DEFAULT_COMPARATOR;
 
     private SuggestUtils() {
         // utils!!
     }
 
-    public static DirectSpellChecker getDirectSpellChecker(DirectSpellcheckerSettings suggestion) {
-        DirectSpellChecker directSpellChecker = new DirectSpellChecker();
-        directSpellChecker.setAccuracy(suggestion.accuracy());
-        Comparator<SuggestWord> comparator;
-        switch (suggestion.sort()) {
-            case SCORE:
-                comparator = SCORE_COMPARATOR;
-                break;
-            case FREQUENCY:
-                comparator = LUCENE_FREQUENCY;
-                break;
-            default:
-                throw new IllegalArgumentException("Illegal suggest sort: " + suggestion.sort());
-        }
-        directSpellChecker.setComparator(comparator);
-        directSpellChecker.setDistance(suggestion.stringDistance());
-        directSpellChecker.setMaxEdits(suggestion.maxEdits());
-        directSpellChecker.setMaxInspections(suggestion.maxInspections());
-        directSpellChecker.setMaxQueryFrequency(suggestion.maxTermFreq());
-        directSpellChecker.setMinPrefix(suggestion.prefixLength());
-        directSpellChecker.setMinQueryLength(suggestion.minWordLength());
-        directSpellChecker.setThresholdFrequency(suggestion.minDocFreq());
-        directSpellChecker.setLowerCaseTerms(false);
-        return directSpellChecker;
-    }
-
-    public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
-        result.clear();
-        for (int i = 0; i < toJoin.length - 1; i++) {
-            result.append(toJoin[i]);
-            result.append(separator);
-        }
-        result.append(toJoin[toJoin.length-1]);
-        return result.get();
-    }
-
     public abstract static class TokenConsumer {
         protected CharTermAttribute charTermAttr;
         protected PositionIncrementAttribute posIncAttr;
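
For reference, the join() helper that this hunk removes from SuggestUtils concatenates a variable number of BytesRef terms into a reusable BytesRefBuilder, inserting the separator between entries. A minimal standalone sketch of that behavior (the demo class and sample terms are illustrative, not part of the commit):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

// Illustrative only: mirrors the join() helper shown removed above.
public class JoinDemo {

    static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
        result.clear();
        for (int i = 0; i < toJoin.length - 1; i++) {
            result.append(toJoin[i]);
            result.append(separator);
        }
        result.append(toJoin[toJoin.length - 1]);
        return result.get();
    }

    public static void main(String[] args) {
        BytesRefBuilder spare = new BytesRefBuilder();
        BytesRef joined = join(new BytesRef(" "), spare, new BytesRef("foo"), new BytesRef("bar"));
        System.out.println(joined.utf8ToString()); // prints "foo bar"
    }
}

Note that the helper assumes toJoin is non-empty; an empty call would index toJoin[-1] and throw ArrayIndexOutOfBoundsException, which is acceptable for its internal n-gram use.
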
@@ -20,7 +20,6 @@ package org.elasticsearch.search.suggest.phrase;
 
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 
 import java.util.Arrays;
@@ -73,7 +72,7 @@ public final class Correction implements Comparable<Correction> {
             len += toJoin[i].length;
         }
         result.grow(len);
-        return SuggestUtils.join(separator, result, toJoin);
+        return WordScorer.join(separator, result, toJoin);
     }
 
     /** Lower scores sorts first; if scores are equal,
@@ -449,7 +449,8 @@ public final class DirectCandidateGeneratorBuilder implements CandidateGenerator
             return new LuceneLevenshteinDistance();
         } else if ("levenstein".equals(distanceVal)) {
             return new LevensteinDistance();
-            //TODO Jaro and Winkler are 2 people - so apply same naming logic as damerau_levenshtein
+            // TODO Jaro and Winkler are 2 people - so apply same naming logic
+            // as damerau_levenshtein
         } else if ("jarowinkler".equals(distanceVal)) {
            return new JaroWinklerDistance();
         } else if ("ngram".equals(distanceVal)) {
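
Context for the TODO reflow above: the surrounding else-if chain maps the user-supplied string distance names onto Lucene's StringDistance implementations. A hedged usage sketch of those classes (the comparison strings and demo class are made up):

import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.LuceneLevenshteinDistance;
import org.apache.lucene.search.spell.NGramDistance;
import org.apache.lucene.search.spell.StringDistance;

// Illustrative only: each implementation returns a similarity score in [0, 1],
// where 1.0 means the strings are identical.
public class DistanceDemo {
    public static void main(String[] args) {
        StringDistance[] distances = {
            new LuceneLevenshteinDistance(),
            new LevensteinDistance(),
            new JaroWinklerDistance(),
            new NGramDistance()
        };
        for (StringDistance distance : distances) {
            System.out.println(distance.getClass().getSimpleName() + ": "
                    + distance.getDistance("levenshtein", "levenstein"));
        }
    }
}
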
@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 
 import java.io.IOException;
@@ -41,15 +40,15 @@ final class LaplaceScorer extends WordScorer {
 
     @Override
     protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
         return (alpha + frequency(spare.get())) / (alpha + w_1.frequency + vocabluarySize);
     }
 
     @Override
     protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2) throws IOException {
-        SuggestUtils.join(separator, spare, w_2.term, w_1.term, word.term);
+        join(separator, spare, w_2.term, w_1.term, word.term);
         long trigramCount = frequency(spare.get());
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
         return (alpha + trigramCount) / (alpha + frequency(spare.get()) + vocabluarySize);
     }
 
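
In LaplaceScorer only the join() call sites change; the smoothing itself is untouched. The unchanged return lines apply add-alpha (Laplace) smoothing: (alpha + count(ngram)) / (alpha + count(w_1) + vocabularySize). A tiny hedged illustration with invented counts:

// Illustrative only: add-alpha (Laplace) smoothing with invented counts.
public class LaplaceDemo {
    public static void main(String[] args) {
        double alpha = 0.5;
        long bigramCount = 3;       // frequency of the joined key "w_1 word"
        long w1Frequency = 40;      // frequency of w_1 alone
        long vocabularySize = 1000; // number of distinct terms in the field
        double score = (alpha + bigramCount) / (alpha + w1Frequency + vocabularySize);
        System.out.println(score);  // 3.5 / 1040.5 ≈ 0.00336
    }
}
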
@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 
 import java.io.IOException;
@@ -56,7 +55,7 @@ public final class LinearInterpolatingScorer extends WordScorer {
 
     @Override
     protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
         final long count = frequency(spare.get());
         if (count < 1) {
             return unigramLambda * scoreUnigram(word);
@@ -66,12 +65,12 @@ public final class LinearInterpolatingScorer extends WordScorer {
 
     @Override
     protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
-        SuggestUtils.join(separator, spare, w.term, w_1.term, w_2.term);
+        join(separator, spare, w.term, w_1.term, w_2.term);
         final long count = frequency(spare.get());
         if (count < 1) {
             return scoreBigram(w, w_1);
         }
-        SuggestUtils.join(separator, spare, w.term, w_1.term);
+        join(separator, spare, w.term, w_1.term);
         return trigramLambda * (count / (1.d + frequency(spare.get()))) + scoreBigram(w, w_1);
     }
 
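
LinearInterpolatingScorer gets the same mechanical change. For orientation, the unchanged trigram return above interpolates the discounted trigram relative frequency with the bigram score, and falls back to scoreBigram entirely when the trigram was never seen. A hedged sketch with invented numbers:

// Illustrative only: invented counts, mirroring the trigram line shown above:
// trigramLambda * (count / (1 + bigramFrequency)) + scoreBigram(w, w_1)
public class InterpolationDemo {
    public static void main(String[] args) {
        double trigramLambda = 0.4;
        long trigramCount = 2;      // frequency of the joined trigram key
        long bigramFrequency = 15;  // frequency of the "w w_1" key looked up before the return
        double bigramScore = 0.01;  // stand-in for scoreBigram(w, w_1)
        double score = trigramCount < 1
                ? bigramScore       // unseen trigram: fall back to the bigram score
                : trigramLambda * (trigramCount / (1.d + bigramFrequency)) + bigramScore;
        System.out.println(score);  // 0.4 * (2 / 16) + 0.01 = 0.06
    }
}
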
@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 
 import java.io.IOException;
@@ -41,7 +40,7 @@ class StupidBackoffScorer extends WordScorer {
 
     @Override
     protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
         final long count = frequency(spare.get());
         if (count < 1) {
             return discount * scoreUnigram(word);
@@ -53,12 +52,12 @@ class StupidBackoffScorer extends WordScorer {
     protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
         // First see if there are bigrams. If there aren't then skip looking up the trigram. This saves lookups
         // when the bigrams and trigrams are rare and we need both anyway.
-        SuggestUtils.join(separator, spare, w_1.term, w.term);
+        join(separator, spare, w_1.term, w.term);
         long bigramCount = frequency(spare.get());
         if (bigramCount < 1) {
             return discount * scoreUnigram(w);
         }
-        SuggestUtils.join(separator, spare, w_2.term, w_1.term, w.term);
+        join(separator, spare, w_2.term, w_1.term, w.term);
         long trigramCount = frequency(spare.get());
         if (trigramCount < 1) {
             return discount * (bigramCount / (w_1.frequency + 0.00000000001d));
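
StupidBackoffScorer likewise only swaps the join() call sites. The unchanged branches above implement the stupid-backoff scheme: when a higher-order n-gram is missing, fall back to the next lower order multiplied by a fixed discount. A hedged sketch assuming the trigram lookup came back empty:

// Illustrative only: invented counts, following the backoff branches visible above.
public class StupidBackoffDemo {
    public static void main(String[] args) {
        double discount = 0.4;
        long bigramCount = 6;        // frequency of the joined "w_1 w" key
        long w1Frequency = 80;       // frequency of w_1 alone
        double unigramScore = 0.002; // stand-in for scoreUnigram(w)

        double score;
        if (bigramCount < 1) {
            // no bigram either: back off all the way to the discounted unigram score
            score = discount * unigramScore;
        } else {
            // unseen trigram: back off to the discounted relative bigram frequency
            score = discount * (bigramCount / (w1Frequency + 0.00000000001d));
        }
        System.out.println(score);   // 0.4 * (6 / 80) = 0.03
    }
}
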
@@ -100,6 +100,16 @@ public abstract class WordScorer {
         return scoreBigram(word, w_1);
     }
 
+    public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
+        result.clear();
+        for (int i = 0; i < toJoin.length - 1; i++) {
+            result.append(toJoin[i]);
+            result.append(separator);
+        }
+        result.append(toJoin[toJoin.length-1]);
+        return result.get();
+    }
+
     public interface WordScorerFactory {
         WordScorer newScorer(IndexReader reader, Terms terms,
                 String field, double realWordLikelyhood, BytesRef separator) throws IOException;
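
With join() now a public static helper on WordScorer, external callers such as Correction (second hunk above) go through WordScorer.join, while the scorer subclasses simply call join(...) unqualified. A short hedged usage sketch (the demo class, separator, and sample terms are made up):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.search.suggest.phrase.WordScorer;

// Illustrative only: building an n-gram lookup key via the relocated helper.
public class WordScorerJoinDemo {
    public static void main(String[] args) {
        BytesRefBuilder spare = new BytesRefBuilder();
        BytesRef key = WordScorer.join(new BytesRef(" "), spare,
                new BytesRef("quick"), new BytesRef("brown"), new BytesRef("fox"));
        System.out.println(key.utf8ToString()); // "quick brown fox"
    }
}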