Speed up phrase suggestion scoring
Two changes:

1. In StupidBackoffScorer, only look up the trigram if there is a bigram.
2. Cache the frequencies in WordScorer so we don't look them up again and again. This is implemented by wrapping the TermsEnum in a special-purpose wrapper that really only works in the context of the WordScorer.

This provides a pretty substantial speedup when there are many candidates.

Closes #5395
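For context on the first change, here is a minimal sketch (a hypothetical standalone method, not part of the patch, using the same smoothing constant as the scorer) of the Stupid Backoff decision chain. The trigram score divides by the bigram count, so when the bigram is missing the trigram count is never needed; seeking the bigram first lets the scorer skip the trigram seek entirely in that case.

    static double stupidBackoff(long trigramCount, long bigramCount, long wMinus1Count,
                                double unigramScore, double discount) {
        if (bigramCount < 1) {
            return discount * unigramScore;                                     // back off to the unigram
        }
        if (trigramCount < 1) {
            return discount * (bigramCount / (wMinus1Count + 0.00000000001d));  // bigram score
        }
        return trigramCount / (bigramCount + 0.00000000001d);                   // trigram score
    }

Note that the old code always paid for the trigram seek first, and then needed a separate bigram seek for the denominator anyway; the reordered version performs at most two seeks, and only one when the bigram is missing.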
parent d88ac0a95a
commit 917c93d7ee
StupidBackoffScorer.java:

@@ -54,18 +54,18 @@ public class StupidBackoffScorer extends WordScorer {
     @Override
     protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
-        SuggestUtils.join(separator, spare, w_2.term, w_1.term, w.term);
-        final long trigramCount = frequency(spare);
-        if (trigramCount < 1) {
-            SuggestUtils.join(separator, spare, w_1.term, w.term);
-            final long count = frequency(spare);
-            if (count < 1) {
-                return discount * scoreUnigram(w);
-            }
-            return discount * (count / (w_1.frequency + 0.00000000001d));
-        }
+        // First see if there are bigrams. If there aren't then skip looking up the trigram. This saves lookups
+        // when the bigrams and trigrams are rare and we need both anyway.
         SuggestUtils.join(separator, spare, w_1.term, w.term);
-        final long bigramCount = frequency(spare);
+        long bigramCount = frequency(spare);
+        if (bigramCount < 1) {
+            return discount * scoreUnigram(w);
+        }
+        SuggestUtils.join(separator, spare, w_2.term, w_1.term, w.term);
+        long trigramCount = frequency(spare);
+        if (trigramCount < 1) {
+            return discount * (bigramCount / (w_1.frequency + 0.00000000001d));
+        }
         return trigramCount / (bigramCount + 0.00000000001d);
     }
WordScorer.java:

@@ -18,10 +18,11 @@
  */
 package org.elasticsearch.search.suggest.phrase;
 
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
+import com.carrotsearch.hppc.ObjectObjectMap;
+import com.carrotsearch.hppc.ObjectObjectOpenHashMap;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.FilterAtomicReader.FilterTermsEnum;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
@@ -35,17 +36,17 @@ public abstract class WordScorer {
     protected final String field;
     protected final Terms terms;
     protected final long vocabluarySize;
-    protected double realWordLikelyhood;
+    protected final double realWordLikelyhood;
     protected final BytesRef spare = new BytesRef();
     protected final BytesRef separator;
     protected final TermsEnum termsEnum;
     private final long numTerms;
     private final boolean useTotalTermFreq;
 
     public WordScorer(IndexReader reader, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
         this(reader, MultiFields.getTerms(reader, field), field, realWordLikelyHood, separator);
     }
 
     public WordScorer(IndexReader reader, Terms terms, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
         this.field = field;
         if (terms == null) {
@@ -56,19 +57,19 @@ public abstract class WordScorer {
         this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize;
         this.useTotalTermFreq = vocSize != -1;
         this.numTerms = terms.size();
-        this.termsEnum = terms.iterator(null);
+        this.termsEnum = new FrequencyCachingTermsEnumWrapper(terms.iterator(null));
         this.reader = reader;
         this.realWordLikelyhood = realWordLikelyHood;
         this.separator = separator;
     }
 
     public long frequency(BytesRef term) throws IOException {
         if (termsEnum.seekExact(term)) {
             return useTotalTermFreq ? termsEnum.totalTermFreq() : termsEnum.docFreq();
         }
         return 0;
     }
 
     protected double channelScore(Candidate candidate, Candidate original) throws IOException {
         if (candidate.stringDistance == 1.0d) {
             return realWordLikelyhood;
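To see why caching frequency() results pays off, here is a back-of-the-envelope count with hypothetical numbers (a sketch, not part of the patch). Candidate paths overlap heavily, so the scorer asks for the same joined n-grams over and over:

    static void whyCachingPays() {
        int terms = 9, candidates = 10;
        long distinctBigrams = (long) (terms - 1) * candidates * candidates; // at most 800 distinct bigrams
        long lookupsPerPath = terms - 1;                                     // every path re-reads 8 of them
        System.out.println(distinctBigrams + " distinct bigrams, " + lookupsPerPath + " lookups per path");
        // Candidate paths number up to candidates^terms (bounded in practice by shard_size),
        // so without caching the same few hundred distinct bigrams are sought from the index
        // repeatedly; with the cache each distinct term costs one TermsEnum seek and every
        // repeat is a hash-map hit.
    }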
@@ -82,7 +83,7 @@ public abstract class WordScorer {
         } else if (at == 1 || gramSize == 2) {
             return Math.log10(channelScore(path[at], candidateSet[at].originalTerm) * scoreBigram(path[at], path[at - 1]));
         } else {
             return Math.log10(channelScore(path[at], candidateSet[at].originalTerm) * scoreTrigram(path[at], path[at - 1], path[at - 2]));
         }
     }
@@ -94,12 +95,93 @@ public abstract class WordScorer {
         return scoreUnigram(word);
     }
 
     protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2) throws IOException {
         return scoreBigram(word, w_1);
     }
 
     public static interface WordScorerFactory {
         public WordScorer newScorer(IndexReader reader, Terms terms,
             String field, double realWordLikelyhood, BytesRef separator) throws IOException;
     }
+
+    /**
+     * Terms enum wrapper that caches term frequencies in an effort to outright skip seeks. Only works with seekExact(BytesRef),
+     * not next or seekCeil. Because of this it really only makes sense in this context.
+     */
+    private static class FrequencyCachingTermsEnumWrapper extends FilterTermsEnum {
+        private ObjectObjectMap<BytesRef, CacheEntry> cache = new ObjectObjectOpenHashMap<BytesRef, CacheEntry>();
+        /**
+         * The cache entry for the last term that the caller attempted to seek to.
+         */
+        private CacheEntry last;
+
+        public FrequencyCachingTermsEnumWrapper(TermsEnum in) {
+            super(in);
+        }
+
+        @Override
+        public boolean seekExact(BytesRef text) throws IOException {
+            last = cache.get(text);
+            if (last != null) {
+                // This'll fail to work properly if the user seeks but doesn't check the frequency, because the
+                // wrapped enum is no longer positioned on the term when the frequency is finally read.
+                // That is OK because WordScorer only seeks to check the frequency.
+                return last.ttf != 0 || last.df != 0;
+            }
+            last = new CacheEntry();
+            cache.put(BytesRef.deepCopyOf(text), last);
+            if (in.seekExact(text)) {
+                // Found. Mark the frequencies as not yet cached (-1) so they are fetched lazily on first read.
+                last.df = -1;
+                last.ttf = -1;
+                return true;
+            }
+            // Not found. The cache entry defaults to 0 for the freqs, meaning not found.
+            return false;
+        }
+
+        @Override
+        public long totalTermFreq() throws IOException {
+            if (last.ttf == -1) {
+                last.ttf = in.totalTermFreq();
+            }
+            return last.ttf;
+        }
+
+        @Override
+        public int docFreq() throws IOException {
+            if (last.df == -1) {
+                last.df = in.docFreq();
+            }
+            return last.df;
+        }
+
+        @Override
+        public void seekExact(long ord) throws IOException {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public SeekStatus seekCeil(BytesRef text) throws IOException {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public BytesRef next() {
+            throw new UnsupportedOperationException();
+        }
+
+        private static class CacheEntry {
+            private long ttf;
+            private int df;
+        }
+    }
 }
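One subtlety worth calling out: WordScorer builds every joined n-gram into the reused spare buffer, which is why seekExact copies the key with BytesRef.deepCopyOf before cache.put. A standalone sketch of that pattern (hypothetical names, a plain HashMap in place of the HPPC map, a String in place of the byte buffer):

    import java.util.HashMap;
    import java.util.Map;

    // Sketch, not the patch: memoizing lookups keyed by a reused scratch buffer. The key
    // must be copied before insertion or every entry would alias the same mutating buffer.
    class ScratchKeyedCache {
        private final Map<String, Long> cache = new HashMap<String, Long>();

        long frequency(StringBuilder scratch) {
            String key = scratch.toString(); // defensive copy, like BytesRef.deepCopyOf
            Long cached = cache.get(key);
            if (cached == null) {
                cached = expensiveLookup(key);
                cache.put(key, cached);
            }
            return cached;
        }

        private long expensiveLookup(String term) {
            return term.length(); // stand-in for a TermsEnum seek
        }
    }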
SuggestSearchTests.java:

@@ -20,11 +20,13 @@
 package org.elasticsearch.search.suggest;
 
 import com.google.common.base.Charsets;
 import com.google.common.collect.ImmutableList;
 import com.google.common.io.Resources;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.ElasticsearchIllegalStateException;
+import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder;
+import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.action.search.*;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentFactory;
@@ -930,6 +932,156 @@ public class SuggestSearchTests extends ElasticsearchIntegrationTest {
                 .size(1));
         assertSuggestion(searchSuggest, 0, 0, "simple_phrase", "nobel prize");
     }
 
+    /**
+     * If the suggester finds tons of options then picking the right one is slow without the frequency caching in WordScorer.
+     */
+    @Test
+    public void suggestWithManyCandidates() throws InterruptedException, ExecutionException, IOException {
+        CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(settingsBuilder()
+                .put(indexSettings())
+                .put(SETTING_NUMBER_OF_SHARDS, 1) // A single shard will help to keep the tests repeatable.
+                .put(SETTING_NUMBER_OF_REPLICAS, between(0, cluster().size() - 1))
+                .put("index.analysis.analyzer.text.tokenizer", "standard")
+                .putArray("index.analysis.analyzer.text.filter", "lowercase", "my_shingle")
+                .put("index.analysis.filter.my_shingle.type", "shingle")
+                .put("index.analysis.filter.my_shingle.output_unigrams", true)
+                .put("index.analysis.filter.my_shingle.min_shingle_size", 2)
+                .put("index.analysis.filter.my_shingle.max_shingle_size", 3));
+
+        XContentBuilder mapping = XContentFactory.jsonBuilder()
+                .startObject()
+                    .startObject("type1")
+                        .startObject("properties")
+                            .startObject("title")
+                                .field("type", "string")
+                                .field("analyzer", "text")
+                            .endObject()
+                        .endObject()
+                    .endObject()
+                .endObject();
+        assertAcked(builder.addMapping("type1", mapping));
+        ensureGreen();
+
+        ImmutableList.Builder<String> titles = ImmutableList.<String>builder();
+
+        // We're going to be searching for:
+        //   united states house of representatives elections in washington 2006
+        // But we need to make sure we generate a ton of suggestions so we add a bunch of candidates.
+        // Many of these candidates are drawn from page names on English Wikipedia.
+
+        // Tons of different options very near the exact query term
+        titles.add("United States House of Representatives Elections in Washington 1789");
+        for (int year = 1790; year < 2014; year += 2) {
+            titles.add("United States House of Representatives Elections in Washington " + year);
+        }
+        // Six of these are near enough to be viable suggestions, just not the top one
+
+        // But we can't stop there! Titles that are just a year are pretty common so let's just add one per year
+        // since 0. Why not?
+        for (int year = 0; year < 2015; year++) {
+            titles.add(Integer.toString(year));
+        }
+        // That ought to provide plenty of less-good candidates for the last term
+
+        // Now add singular and plural variants of every term we can
+        titles.add("State");
+        titles.add("Houses of Parliament");
+        titles.add("Representative Government");
+        titles.add("Election");
+
+        // Now some possessives
+        titles.add("Washington's Birthday");
+
+        // And some conjugation
+        titles.add("Unified Modeling Language");
+        titles.add("Unite Against Fascism");
+        titles.add("Stated Income Tax");
+        titles.add("Media organizations housed within colleges");
+
+        // And other stuff
+        titles.add("Untied shoelaces");
+        titles.add("Unit circle");
+        titles.add("Untitled");
+        titles.add("Unicef");
+        titles.add("Unrated");
+        titles.add("UniRed");
+        titles.add("Jalan Uniten–Dengkil"); // Highway in Malaysia
+        titles.add("UNITAS");
+        titles.add("UNITER");
+        titles.add("Un-Led-Ed");
+        titles.add("STATS LLC");
+        titles.add("Staples");
+        titles.add("Skates");
+        titles.add("Statues of the Liberators");
+        titles.add("Staten Island");
+        titles.add("Statens Museum for Kunst");
+        titles.add("Hause"); // The last name or the German word, whichever.
+        titles.add("Hose");
+        titles.add("Hoses");
+        titles.add("Howse Peak");
+        titles.add("The Hoose-Gow");
+        titles.add("Hooser");
+        titles.add("Electron");
+        titles.add("Electors");
+        titles.add("Evictions");
+        titles.add("Coronal mass ejection");
+        titles.add("Wasington"); // A film?
+        titles.add("Warrington"); // A town in England
+        titles.add("Waddington"); // Lots of places have this name
+        titles.add("Watlington"); // Ditto
+        titles.add("Waplington"); // Yup, also a town
+        titles.add("Washing of the Spears"); // Book
+
+        for (char c = 'A'; c <= 'Z'; c++) {
+            // Can't forget lists, glorious lists!
+            titles.add("List of former members of the United States House of Representatives (" + c + ")");
+
+            // Lots of people are named Washington <Middle Initial>. Lastname
+            titles.add("Washington " + c + ". Lastname");
+
+            // Let's just add some more to be evil
+            titles.add("United " + c);
+            titles.add("States " + c);
+            titles.add("House " + c);
+            titles.add("Elections " + c);
+            titles.add("2006 " + c);
+            titles.add(c + " United");
+            titles.add(c + " States");
+            titles.add(c + " House");
+            titles.add(c + " Elections");
+            titles.add(c + " 2006");
+        }
+
+        List<IndexRequestBuilder> builders = new ArrayList<IndexRequestBuilder>();
+        for (String title: titles.build()) {
+            builders.add(client().prepareIndex("test", "type1").setSource("title", title));
+        }
+        indexRandom(true, builders);
+
+        PhraseSuggestionBuilder suggest = phraseSuggestion("title")
+                .field("title")
+                .addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("title")
+                        .suggestMode("always")
+                        .maxTermFreq(.99f)
+                        .size(1000) // Setting a silly high size helps to generate a larger list of candidates for testing.
+                        .maxInspections(1000) // This too
+                )
+                .confidence(0f)
+                .maxErrors(2f)
+                .shardSize(30000)
+                .size(30000);
+        Suggest searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", suggest);
+        assertSuggestion(searchSuggest, 0, 0, "title", "united states house of representatives elections in washington 2006");
+        assertSuggestionSize(searchSuggest, 0, 25480, "title"); // Just to prove that we've run through a ton of options
+
+        suggest.size(1);
+        long start = System.currentTimeMillis();
+        searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", suggest);
+        long total = System.currentTimeMillis() - start;
+        assertSuggestion(searchSuggest, 0, 0, "title", "united states house of representatives elections in washington 2006");
+        assertThat(total, lessThan(1000L)); // Takes many seconds without the fix
+    }
+
     protected Suggest searchSuggest(SuggestionBuilder<?>... suggestion) {
         return searchSuggest(null, suggestion);
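An aside on the test's analyzer: with output_unigrams enabled and shingle sizes 2 through 3, each title is indexed as unigrams plus space-joined bigrams and trigrams, which line up with the joined terms WordScorer.frequency() looks up. An illustrative snippet (not part of the test):

    // Illustration: terms the "text" analyzer in the test indexes for "United States House".
    String[] indexedTerms = {
        "united", "states", "house",      // unigrams (output_unigrams = true)
        "united states", "states house",  // 2-shingles
        "united states house"             // 3-shingle
    };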