Moving join() helper function to WordScorer
parent cdc77648a1
commit e6d57af0c5
@@ -22,61 +22,18 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.common.ParseField;

import java.io.IOException;
import java.util.Comparator;

public final class SuggestUtils {
    private static final Comparator<SuggestWord> LUCENE_FREQUENCY = new SuggestWordFrequencyComparator();
    private static final Comparator<SuggestWord> SCORE_COMPARATOR = SuggestWordQueue.DEFAULT_COMPARATOR;

    private SuggestUtils() {
        // utils!!
    }

    public static DirectSpellChecker getDirectSpellChecker(DirectSpellcheckerSettings suggestion) {
        DirectSpellChecker directSpellChecker = new DirectSpellChecker();
        directSpellChecker.setAccuracy(suggestion.accuracy());
        Comparator<SuggestWord> comparator;
        switch (suggestion.sort()) {
            case SCORE:
                comparator = SCORE_COMPARATOR;
                break;
            case FREQUENCY:
                comparator = LUCENE_FREQUENCY;
                break;
            default:
                throw new IllegalArgumentException("Illegal suggest sort: " + suggestion.sort());
        }
        directSpellChecker.setComparator(comparator);
        directSpellChecker.setDistance(suggestion.stringDistance());
        directSpellChecker.setMaxEdits(suggestion.maxEdits());
        directSpellChecker.setMaxInspections(suggestion.maxInspections());
        directSpellChecker.setMaxQueryFrequency(suggestion.maxTermFreq());
        directSpellChecker.setMinPrefix(suggestion.prefixLength());
        directSpellChecker.setMinQueryLength(suggestion.minWordLength());
        directSpellChecker.setThresholdFrequency(suggestion.minDocFreq());
        directSpellChecker.setLowerCaseTerms(false);
        return directSpellChecker;
    }

-    public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
-        result.clear();
-        for (int i = 0; i < toJoin.length - 1; i++) {
-            result.append(toJoin[i]);
-            result.append(separator);
-        }
-        result.append(toJoin[toJoin.length-1]);
-        return result.get();
-    }
-
    public abstract static class TokenConsumer {
        protected CharTermAttribute charTermAttr;
        protected PositionIncrementAttribute posIncAttr;
@@ -20,7 +20,6 @@ package org.elasticsearch.search.suggest.phrase;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
-import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;

import java.util.Arrays;

@@ -73,7 +72,7 @@ public final class Correction implements Comparable<Correction> {
            len += toJoin[i].length;
        }
        result.grow(len);
-        return SuggestUtils.join(separator, result, toJoin);
+        return WordScorer.join(separator, result, toJoin);
    }

    /** Lower scores sorts first; if scores are equal,
@@ -449,7 +449,8 @@ public final class DirectCandidateGeneratorBuilder implements CandidateGenerator
            return new LuceneLevenshteinDistance();
        } else if ("levenstein".equals(distanceVal)) {
            return new LevensteinDistance();
-            //TODO Jaro and Winkler are 2 people - so apply same naming logic as damerau_levenshtein
+            // TODO Jaro and Winkler are 2 people - so apply same naming logic
+            // as damerau_levenshtein
        } else if ("jarowinkler".equals(distanceVal)) {
            return new JaroWinklerDistance();
        } else if ("ngram".equals(distanceVal)) {
@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;

import java.io.IOException;

@@ -41,15 +40,15 @@ final class LaplaceScorer extends WordScorer {

    @Override
    protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
        return (alpha + frequency(spare.get())) / (alpha + w_1.frequency + vocabluarySize);
    }

    @Override
    protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2) throws IOException {
-        SuggestUtils.join(separator, spare, w_2.term, w_1.term, word.term);
+        join(separator, spare, w_2.term, w_1.term, word.term);
        long trigramCount = frequency(spare.get());
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
        return (alpha + trigramCount) / (alpha + frequency(spare.get()) + vocabluarySize);
    }
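For reference, a minimal standalone sketch of the add-alpha estimate computed in scoreBigram() above, using hypothetical counts; the class and variable names are illustrative and not part of this commit:

public class LaplaceBigramExample {
    public static void main(String[] args) {
        double alpha = 0.5;            // assumed smoothing constant
        long bigramCount = 3;          // hypothetical frequency of the joined "w_1 word" key
        long w1Count = 40;             // hypothetical frequency of w_1 on its own
        long vocabularySize = 10_000;  // hypothetical number of distinct terms
        double score = (alpha + bigramCount) / (alpha + w1Count + vocabularySize);
        System.out.println(score);     // prints roughly 3.49E-4
    }
}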
@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;

import java.io.IOException;

@@ -56,7 +55,7 @@ public final class LinearInterpolatingScorer extends WordScorer {

    @Override
    protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
        final long count = frequency(spare.get());
        if (count < 1) {
            return unigramLambda * scoreUnigram(word);

@@ -66,12 +65,12 @@ public final class LinearInterpolatingScorer extends WordScorer {

    @Override
    protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
-        SuggestUtils.join(separator, spare, w.term, w_1.term, w_2.term);
+        join(separator, spare, w.term, w_1.term, w_2.term);
        final long count = frequency(spare.get());
        if (count < 1) {
            return scoreBigram(w, w_1);
        }
-        SuggestUtils.join(separator, spare, w.term, w_1.term);
+        join(separator, spare, w.term, w_1.term);
        return trigramLambda * (count / (1.d + frequency(spare.get()))) + scoreBigram(w, w_1);
    }
@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;

import java.io.IOException;

@@ -41,7 +40,7 @@ class StupidBackoffScorer extends WordScorer {

    @Override
    protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
        final long count = frequency(spare.get());
        if (count < 1) {
            return discount * scoreUnigram(word);

@@ -53,12 +52,12 @@ class StupidBackoffScorer extends WordScorer {
    protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
        // First see if there are bigrams. If there aren't then skip looking up the trigram. This saves lookups
        // when the bigrams and trigrams are rare and we need both anyway.
-        SuggestUtils.join(separator, spare, w_1.term, w.term);
+        join(separator, spare, w_1.term, w.term);
        long bigramCount = frequency(spare.get());
        if (bigramCount < 1) {
            return discount * scoreUnigram(w);
        }
-        SuggestUtils.join(separator, spare, w_2.term, w_1.term, w.term);
+        join(separator, spare, w_2.term, w_1.term, w.term);
        long trigramCount = frequency(spare.get());
        if (trigramCount < 1) {
            return discount * (bigramCount / (w_1.frequency + 0.00000000001d));
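A minimal sketch of the early backoff taken in scoreTrigram() above, with an assumed discount and hypothetical counts; the class name and values are illustrative only:

public class StupidBackoffExample {
    public static void main(String[] args) {
        double discount = 0.4;         // assumed discount factor
        long bigramCount = 0;          // hypothetical: the joined "w_1 w" key was never seen
        double unigramScore = 2.0e-5;  // hypothetical unigram estimate for w
        if (bigramCount < 1) {
            // No bigram evidence, so back off right away and skip the trigram lookup.
            System.out.println(discount * unigramScore); // prints roughly 8.0E-6
        }
    }
}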
@@ -100,6 +100,16 @@ public abstract class WordScorer {
        return scoreBigram(word, w_1);
    }

+    public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
+        result.clear();
+        for (int i = 0; i < toJoin.length - 1; i++) {
+            result.append(toJoin[i]);
+            result.append(separator);
+        }
+        result.append(toJoin[toJoin.length-1]);
+        return result.get();
+    }
+
    public interface WordScorerFactory {
        WordScorer newScorer(IndexReader reader, Terms terms,
                String field, double realWordLikelyhood, BytesRef separator) throws IOException;
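As a usage note, a minimal sketch of how the relocated helper behaves from a hypothetical caller; the class name and example terms are illustrative, not part of this commit:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.search.suggest.phrase.WordScorer;

public class JoinExample {
    public static void main(String[] args) {
        BytesRefBuilder spare = new BytesRefBuilder();
        // Build the kind of bigram key the scorers look up: terms glued together with a separator.
        BytesRef joined = WordScorer.join(new BytesRef(" "), spare,
                new BytesRef("american"), new BytesRef("ace"));
        System.out.println(joined.utf8ToString()); // prints "american ace"
    }
}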