Moving join() helper function to WordScorer

This commit is contained in:
Christoph Büscher 2016-08-10 13:06:43 +02:00
parent cdc77648a1
commit e6d57af0c5
7 changed files with 22 additions and 58 deletions

View File

@ -22,61 +22,18 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.common.ParseField;
import java.io.IOException;
import java.util.Comparator;
public final class SuggestUtils {
// Comparator selected by getDirectSpellChecker when the settings' sort mode is FREQUENCY.
private static final Comparator<SuggestWord> LUCENE_FREQUENCY = new SuggestWordFrequencyComparator();
// Comparator selected by getDirectSpellChecker when the settings' sort mode is SCORE.
private static final Comparator<SuggestWord> SCORE_COMPARATOR = SuggestWordQueue.DEFAULT_COMPARATOR;
private SuggestUtils() {
// Private so the class cannot be instantiated from outside; the members shown here are static.
}
/**
 * Creates a {@link DirectSpellChecker} and copies every tunable of the given settings onto it.
 *
 * @param suggestion settings providing accuracy, sort mode, string distance, edit and inspection
 *                   limits, frequency thresholds, and prefix/word length constraints
 * @return the newly configured spell checker; lower-casing of terms is always disabled
 * @throws IllegalArgumentException if the sort mode is neither {@code SCORE} nor {@code FREQUENCY}
 */
public static DirectSpellChecker getDirectSpellChecker(DirectSpellcheckerSettings suggestion) {
    final DirectSpellChecker checker = new DirectSpellChecker();
    checker.setAccuracy(suggestion.accuracy());

    // Translate the configured sort mode into the matching comparator.
    final Comparator<SuggestWord> ordering;
    switch (suggestion.sort()) {
        case FREQUENCY:
            ordering = LUCENE_FREQUENCY;
            break;
        case SCORE:
            ordering = SCORE_COMPARATOR;
            break;
        default:
            throw new IllegalArgumentException("Illegal suggest sort: " + suggestion.sort());
    }
    checker.setComparator(ordering);

    // Remaining settings map one-to-one onto the checker's setters.
    checker.setDistance(suggestion.stringDistance());
    checker.setMaxEdits(suggestion.maxEdits());
    checker.setMaxInspections(suggestion.maxInspections());
    checker.setMaxQueryFrequency(suggestion.maxTermFreq());
    checker.setMinPrefix(suggestion.prefixLength());
    checker.setMinQueryLength(suggestion.minWordLength());
    checker.setThresholdFrequency(suggestion.minDocFreq());
    checker.setLowerCaseTerms(false);
    return checker;
}
/**
 * Concatenates the given terms into {@code result}, placing {@code separator} between
 * consecutive terms.
 * <p>
 * {@code result} is cleared first, so any previous content is discarded. When no terms are
 * passed the builder is simply left empty instead of failing with an
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @param separator bytes inserted between two neighbouring terms
 * @param result    builder that receives the joined bytes; overwritten by this call
 * @param toJoin    terms to concatenate, in order; may be empty
 * @return the value of {@code result.get()} after the terms have been appended
 */
public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
    result.clear();
    for (int i = 0; i < toJoin.length; i++) {
        // The separator goes between terms only, never before the first or after the last.
        if (i > 0) {
            result.append(separator);
        }
        result.append(toJoin[i]);
    }
    return result.get();
}
public abstract static class TokenConsumer {
protected CharTermAttribute charTermAttr;
protected PositionIncrementAttribute posIncAttr;

View File

@ -20,7 +20,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import java.util.Arrays;
@ -73,7 +72,7 @@ public final class Correction implements Comparable<Correction> {
len += toJoin[i].length;
}
result.grow(len);
return SuggestUtils.join(separator, result, toJoin);
return WordScorer.join(separator, result, toJoin);
}
/** Lower scores sorts first; if scores are equal,

View File

@ -449,7 +449,8 @@ public final class DirectCandidateGeneratorBuilder implements CandidateGenerator
return new LuceneLevenshteinDistance();
} else if ("levenstein".equals(distanceVal)) {
return new LevensteinDistance();
//TODO Jaro and Winkler are 2 people - so apply same naming logic as damerau_levenshtein
// TODO Jaro and Winkler are 2 people - so apply same naming logic
// as damerau_levenshtein
} else if ("jarowinkler".equals(distanceVal)) {
return new JaroWinklerDistance();
} else if ("ngram".equals(distanceVal)) {

View File

@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import java.io.IOException;
@ -41,15 +40,15 @@ final class LaplaceScorer extends WordScorer {
/**
 * Scores {@code word} together with the candidate {@code w_1}.
 * The two terms are joined as (w_1, word) with the separator into {@code spare}, and the
 * result is (alpha + frequency of the joined term) / (alpha + w_1.frequency + vocabluarySize).
 */
@Override
protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
SuggestUtils.join(separator, spare, w_1.term, word.term);
join(separator, spare, w_1.term, word.term);
return (alpha + frequency(spare.get())) / (alpha + w_1.frequency + vocabluarySize);
}
/**
 * Scores {@code word} together with the candidates {@code w_1} and {@code w_2}.
 * The frequency of the joined (w_2, w_1, word) term is looked up first; {@code spare} is then
 * reused for the joined (w_1, word) term. The result is
 * (alpha + trigramCount) / (alpha + frequency of the joined (w_1, word) term + vocabluarySize).
 */
@Override
protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2) throws IOException {
SuggestUtils.join(separator, spare, w_2.term, w_1.term, word.term);
join(separator, spare, w_2.term, w_1.term, word.term);
// Capture the trigram count now: spare is overwritten by the next join.
long trigramCount = frequency(spare.get());
SuggestUtils.join(separator, spare, w_1.term, word.term);
join(separator, spare, w_1.term, word.term);
return (alpha + trigramCount) / (alpha + frequency(spare.get()) + vocabluarySize);
}

View File

@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import java.io.IOException;
@ -56,7 +55,7 @@ public final class LinearInterpolatingScorer extends WordScorer {
@Override
protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
SuggestUtils.join(separator, spare, w_1.term, word.term);
join(separator, spare, w_1.term, word.term);
final long count = frequency(spare.get());
if (count < 1) {
return unigramLambda * scoreUnigram(word);
@ -66,12 +65,12 @@ public final class LinearInterpolatingScorer extends WordScorer {
/**
 * Scores {@code w} together with the candidates {@code w_1} and {@code w_2}.
 * If the joined three-term entry has a frequency below 1, the score is just
 * {@code scoreBigram(w, w_1)}. Otherwise the result is
 * trigramLambda * (count / (1 + frequency of the joined two-term entry)) + scoreBigram(w, w_1).
 *
 * NOTE(review): the terms are joined here as (w, w_1, w_2) and (w, w_1), while scoreBigram in
 * this class joins (w_1, word) - confirm that this ordering is intended.
 */
@Override
protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
SuggestUtils.join(separator, spare, w.term, w_1.term, w_2.term);
join(separator, spare, w.term, w_1.term, w_2.term);
final long count = frequency(spare.get());
if (count < 1) {
// No entry for the three joined terms: fall back to the bigram score alone.
return scoreBigram(w, w_1);
}
SuggestUtils.join(separator, spare, w.term, w_1.term);
join(separator, spare, w.term, w_1.term);
// 1.d forces floating-point division and keeps the denominator non-zero.
return trigramLambda * (count / (1.d + frequency(spare.get()))) + scoreBigram(w, w_1);
}

View File

@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import java.io.IOException;
@ -41,7 +40,7 @@ class StupidBackoffScorer extends WordScorer {
@Override
protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
SuggestUtils.join(separator, spare, w_1.term, word.term);
join(separator, spare, w_1.term, word.term);
final long count = frequency(spare.get());
if (count < 1) {
return discount * scoreUnigram(word);
@ -53,12 +52,12 @@ class StupidBackoffScorer extends WordScorer {
protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
// First see if there are bigrams. If there aren't then skip looking up the trigram. This saves lookups
// when the bigrams and trigrams are rare and we need both anyway.
SuggestUtils.join(separator, spare, w_1.term, w.term);
join(separator, spare, w_1.term, w.term);
long bigramCount = frequency(spare.get());
if (bigramCount < 1) {
return discount * scoreUnigram(w);
}
SuggestUtils.join(separator, spare, w_2.term, w_1.term, w.term);
join(separator, spare, w_2.term, w_1.term, w.term);
long trigramCount = frequency(spare.get());
if (trigramCount < 1) {
return discount * (bigramCount / (w_1.frequency + 0.00000000001d));

View File

@ -100,6 +100,16 @@ public abstract class WordScorer {
return scoreBigram(word, w_1);
}
/**
 * Joins the given terms into {@code result}, inserting {@code separator} between each pair
 * of adjacent terms.
 * <p>
 * The builder is cleared before anything is appended. An empty {@code toJoin} leaves the
 * builder empty rather than throwing an {@link ArrayIndexOutOfBoundsException}.
 *
 * @param separator bytes written between two adjacent terms
 * @param result    reusable builder that is overwritten with the joined bytes
 * @param toJoin    terms to join, in the order given; may be empty
 * @return the value of {@code result.get()} once all terms have been appended
 */
public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
    result.clear();
    for (int i = 0; i < toJoin.length; i++) {
        // Emit the separator only between terms, so there is no leading or trailing one.
        if (i > 0) {
            result.append(separator);
        }
        result.append(toJoin[i]);
    }
    return result.get();
}
public interface WordScorerFactory {
WordScorer newScorer(IndexReader reader, Terms terms,
String field, double realWordLikelyhood, BytesRef separator) throws IOException;