mirror of https://github.com/apache/lucene.git
LUCENE-6339: Added Near-real time Document Suggester via custom postings format
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1669698 13f79535-47bb-0310-9956-ffa450edef68
parent f49f2b2af6
commit b7adb6ac77
@@ -19,6 +19,9 @@ New Features
   for counting ranges that align with the underlying terms as defined by the
   NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley)
 
+* LUCENE-6339: Added Near-real time Document Suggester via custom postings format
+  (Areek Zillur, Mike McCandless, Simon Willnauer)
+
 API Changes
 
 * LUCENE-3312: The API of oal.document was restructured to
@@ -49,7 +49,7 @@ public class FSTUtil {
     public final FST.Arc<T> fstNode;
 
     /** Output of the path so far: */
-    T output;
+    public final T output;
 
     /** Input of the path so far: */
     public final IntsRefBuilder input;
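The widened visibility above matters to the new suggester below: callers of FSTUtil.intersectPrefixPaths now read a path's accumulated output directly when seeding a top-N search. A minimal sketch of that usage pattern, assuming an already-built FST<Long> named fst, a prefix automaton, and a Util.TopNSearcher<Long> named searcher (all illustrative names, imports omitted):

    // Each returned Path pairs the FST node reached by the prefix with the
    // output accumulated along the way; path.output is readable here only
    // because the field is now public final.
    List<FSTUtil.Path<Long>> prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
    for (FSTUtil.Path<Long> path : prefixPaths) {
      searcher.addStartPaths(path.fstNode, path.output, false, path.input);
    }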
@@ -0,0 +1,42 @@
package org.apache.lucene.search.suggest.document;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.codecs.PostingsFormat;

/**
 * {@link org.apache.lucene.search.suggest.document.CompletionPostingsFormat}
 * for {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat}
 *
 * @lucene.experimental
 */
public class Completion50PostingsFormat extends CompletionPostingsFormat {

  /**
   * Sole Constructor
   */
  public Completion50PostingsFormat() {
    super();
  }

  @Override
  protected PostingsFormat delegatePostingsFormat() {
    return PostingsFormat.forName("Lucene50");
  }
}
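A concrete format like this is normally wired in per field through the codec. A minimal sketch of that wiring, assuming the Lucene 5.x APIs this branch ships with (Lucene50Codec, IndexWriterConfig, StandardAnalyzer); the field name suggest_title is hypothetical and imports are omitted:

    // Route only the suggest field to the completion postings format; every
    // other field keeps the codec's default postings format.
    static IndexWriterConfig suggestAwareConfig() {
      IndexWriterConfig config = new IndexWriterConfig(new CompletionAnalyzer(new StandardAnalyzer()));
      config.setCodec(new Lucene50Codec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
          if ("suggest_title".equals(field)) {
            return new Completion50PostingsFormat();
          }
          return super.getPostingsFormatForField(field);
        }
      });
      return config;
    }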
@@ -0,0 +1,173 @@
package org.apache.lucene.search.suggest.document;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;

/**
 * Wraps an {@link org.apache.lucene.analysis.Analyzer}
 * to provide additional completion-only tuning
 * (e.g. preserving token separators, preserving position increments while converting
 * a token stream to an automaton)
 * <p>
 * Can be used to index {@link SuggestField}
 * and as a query analyzer to {@link SuggestIndexSearcher}
 * <p>
 * NOTE: In most cases, index and query analyzer should have same values for {@link #preservePositionIncrements}
 * and {@link #preserveSep}
 *
 * @lucene.experimental
 */
public class CompletionAnalyzer extends AnalyzerWrapper {

  /**
   * Represents the separation between tokens, if
   * <code>preserveSep</code> is <code>true</code>
   * <p>
   * Same label is used as a delimiter in the {@link org.apache.lucene.search.suggest.document.CompletionTokenStream}
   * payload
   */
  final static int SEP_LABEL = NRTSuggesterBuilder.PAYLOAD_SEP;

  /**
   * Represents a hole character, inserted by {@link org.apache.lucene.analysis.TokenStreamToAutomaton}
   */
  final static int HOLE_CHARACTER = TokenStreamToAutomaton.HOLE;

  final static int DEFAULT_MAX_GRAPH_EXPANSIONS = -1;
  final static boolean DEFAULT_PRESERVE_SEP = true;
  final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;

  private final Analyzer analyzer;

  /**
   * Preserve separation between tokens
   * when converting to an automaton
   * <p>
   * Defaults to <code>true</code>
   */
  private final boolean preserveSep;

  /**
   * Preserve position increments for tokens
   * when converting to an automaton
   * <p>
   * Defaults to <code>true</code>
   */
  private final boolean preservePositionIncrements;

  /**
   * Sets the maximum number of graph expansions of a completion automaton
   * <p>
   * Defaults to <code>-1</code> (no limit)
   */
  private final int maxGraphExpansions;

  /**
   * Wraps an analyzer to convert its output token stream to an automaton
   *
   * @param analyzer token stream to be converted to an automaton
   * @param preserveSep Preserve separation between tokens when converting to an automaton
   * @param preservePositionIncrements Preserve position increments for tokens when converting to an automaton
   * @param maxGraphExpansions Sets the maximum number of graph expansions of a completion automaton
   */
  public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
    super(PER_FIELD_REUSE_STRATEGY);
    this.analyzer = analyzer;
    this.preserveSep = preserveSep;
    this.preservePositionIncrements = preservePositionIncrements;
    this.maxGraphExpansions = maxGraphExpansions;
  }

  /**
   * Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
   * preserving token separation, position increments and no limit on graph expansions
   */
  public CompletionAnalyzer(Analyzer analyzer) {
    this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
  }

  /**
   * Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
   * with no limit on graph expansions
   */
  public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements) {
    this(analyzer, preserveSep, preservePositionIncrements, DEFAULT_MAX_GRAPH_EXPANSIONS);
  }

  /**
   * Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
   * preserving token separation and position increments
   */
  public CompletionAnalyzer(Analyzer analyzer, int maxGraphExpansions) {
    this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions);
  }

  @Override
  protected Analyzer getWrappedAnalyzer(String fieldName) {
    return analyzer;
  }

  @Override
  protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
    CompletionTokenStream tokenStream = new CompletionTokenStream(components.getTokenStream(),
        preserveSep, preservePositionIncrements, SEP_LABEL, maxGraphExpansions);
    return new TokenStreamComponents(components.getTokenizer(), tokenStream);
  }

  /**
   * Converts <code>key</code> to an automaton using
   * {@link #preservePositionIncrements}, {@link #preserveSep}
   * and {@link #maxGraphExpansions}
   */
  public Automaton toAutomaton(String field, CharSequence key) throws IOException {
    for (int i = 0; i < key.length(); i++) {
      switch (key.charAt(i)) {
        case HOLE_CHARACTER:
          throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
        case SEP_LABEL:
          throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
        default:
          break;
      }
    }

    try (TokenStream tokenStream = analyzer.tokenStream(field, key.toString())) {
      try (CompletionTokenStream stream = new CompletionTokenStream(tokenStream,
          preserveSep, preservePositionIncrements, SEP_LABEL, maxGraphExpansions)) {
        return stream.toAutomaton(tokenStream);
      }
    }
  }

}
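In practice the same wrapped analyzer serves both sides: it analyzes SuggestField values at index time and turns the user's typed prefix into a query automaton at lookup time. A short sketch, assuming a StandardAnalyzer delegate; the field name is hypothetical and imports are omitted:

    // Index-time and query-time analysis should agree on preserveSep and
    // preservePositionIncrements, hence reusing one CompletionAnalyzer for both.
    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(new StandardAnalyzer());
    Automaton queryAutomaton = completionAnalyzer.toAutomaton("suggest_title", "star wa");
    // queryAutomaton now accepts every analyzed form the prefix "star wa" expands to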
@@ -0,0 +1,192 @@
package org.apache.lucene.search.suggest.document;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;

import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.CODEC_NAME;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;

/**
 * <p>
 * Weighted FSTs for any indexed {@link SuggestField}s are built on {@link #write(Fields)}.
 * A weighted FST maps the analyzed forms of a field to its
 * surface form and document id. FSTs are stored in the CompletionDictionary (.lkp).
 * </p>
 * <p>
 * The file offsets of a field's FST are stored in the CompletionIndex (.cmp)
 * along with the field's internal number {@link FieldInfo#number} on {@link #close()}.
 * </p>
 *
 */
final class CompletionFieldsConsumer extends FieldsConsumer {

  private final String delegatePostingsFormatName;
  private final Map<String, Long> seenFields = new HashMap<>();
  private final SegmentWriteState state;
  private IndexOutput dictOut;
  private FieldsConsumer delegateFieldsConsumer;

  CompletionFieldsConsumer(PostingsFormat delegatePostingsFormat, SegmentWriteState state) throws IOException {
    this.delegatePostingsFormatName = delegatePostingsFormat.getName();
    this.state = state;
    String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
    boolean success = false;
    try {
      this.delegateFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
      dictOut = state.directory.createOutput(dictFile, state.context);
      CodecUtil.writeIndexHeader(dictOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      success = true;
    } finally {
      if (success == false) {
        IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
      }
    }
  }

  @Override
  public void write(Fields fields) throws IOException {
    delegateFieldsConsumer.write(fields);

    for (String field : fields) {
      CompletionTermWriter termWriter = new CompletionTermWriter();
      Terms terms = fields.terms(field);
      TermsEnum termsEnum = terms.iterator(null);

      // write terms
      BytesRef term;
      while ((term = termsEnum.next()) != null) {
        termWriter.write(term, termsEnum);
      }

      // store lookup, if needed
      long filePointer = dictOut.getFilePointer();
      if (termWriter.finish(dictOut)) {
        seenFields.put(field, filePointer);
      }
    }
  }

  private boolean closed = false;

  @Override
  public void close() throws IOException {
    if (closed) {
      return;
    }
    closed = true;
    String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
    boolean success = false;
    try (IndexOutput indexOut = state.directory.createOutput(indexFile, state.context)) {
      delegateFieldsConsumer.close();
      CodecUtil.writeIndexHeader(indexOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      /*
       * we write the delegate postings format name so we can load it
       * without getting an instance in the ctor
       */
      indexOut.writeString(delegatePostingsFormatName);
      // write # of seen fields
      indexOut.writeVInt(seenFields.size());
      // write field numbers and dictOut offsets
      for (Map.Entry<String, Long> seenField : seenFields.entrySet()) {
        FieldInfo fieldInfo = state.fieldInfos.fieldInfo(seenField.getKey());
        indexOut.writeVInt(fieldInfo.number);
        indexOut.writeVLong(seenField.getValue());
      }
      CodecUtil.writeFooter(indexOut);
      CodecUtil.writeFooter(dictOut);
      IOUtils.close(dictOut);
      success = true;
    } finally {
      if (success == false) {
        IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
      }
    }
  }

  // builds an FST based on the terms written
  private static class CompletionTermWriter {

    private PostingsEnum postingsEnum = null;
    private int docCount = 0;

    private final BytesRefBuilder scratch = new BytesRefBuilder();
    private final NRTSuggesterBuilder builder;

    public CompletionTermWriter() {
      builder = new NRTSuggesterBuilder();
    }

    /**
     * Stores the built FST in <code>output</code>
     * Returns true if there was anything stored, false otherwise
     */
    public boolean finish(IndexOutput output) throws IOException {
      boolean stored = builder.store(output);
      assert stored || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
      return stored;
    }

    /**
     * Writes all postings (surface form, weight, document id) for <code>term</code>
     */
    public void write(BytesRef term, TermsEnum termsEnum) throws IOException {
      postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.PAYLOADS);
      builder.startTerm(term);
      int docFreq = 0;
      while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        int docID = postingsEnum.docID();
        for (int i = 0; i < postingsEnum.freq(); i++) {
          postingsEnum.nextPosition();
          assert postingsEnum.getPayload() != null;
          BytesRef payload = postingsEnum.getPayload();
          ByteArrayDataInput input = new ByteArrayDataInput(payload.bytes, payload.offset, payload.length);
          int len = input.readVInt();
          scratch.grow(len);
          scratch.setLength(len);
          input.readBytes(scratch.bytes(), 0, scratch.length());
          builder.addEntry(docID, scratch.get(), input.readVLong() - 1);
        }
        docFreq++;
        docCount = Math.max(docCount, docFreq + 1);
      }
      builder.finishTerm();
    }
  }
}
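The write(BytesRef, TermsEnum) loop above assumes a fixed per-position payload layout produced at index time (presumably by SuggestField, which is not part of this excerpt): a VInt surface-form length, the surface-form bytes, then the weight stored as a VLong plus one. A decoding sketch of that assumed layout, with payload being the BytesRef taken from the postings enum:

    // Assumed layout: [VInt surfaceLen][surfaceLen bytes of surface form][VLong weight + 1]
    ByteArrayDataInput in = new ByteArrayDataInput(payload.bytes, payload.offset, payload.length);
    int surfaceLen = in.readVInt();            // length of the original surface form
    byte[] surface = new byte[surfaceLen];
    in.readBytes(surface, 0, surfaceLen);      // the surface form shown to users
    long weight = in.readVLong() - 1;          // subtract the +1 the writer added, as addEntry() does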
@@ -0,0 +1,228 @@
package org.apache.lucene.search.suggest.document;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.IOUtils;

import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.CODEC_NAME;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_CODEC_VERSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;

/**
 * <p>
 * Completion index (.cmp) is opened and read at instantiation to read in {@link SuggestField}
 * numbers and their FST offsets in the Completion dictionary (.lkp).
 * </p>
 * <p>
 * Completion dictionary (.lkp) is opened at instantiation and a field's FST is loaded
 * into memory the first time it is requested via {@link #terms(String)}.
 * </p>
 * <p>
 * NOTE: Only the footer is validated for Completion dictionary (.lkp) and not the checksum due
 * to random access pattern and checksum validation being too costly at instantiation
 * </p>
 *
 */
final class CompletionFieldsProducer extends FieldsProducer {

  private FieldsProducer delegateFieldsProducer;
  private Map<String, CompletionsTermsReader> readers;
  private IndexInput dictIn;

  // copy ctor for merge instance
  private CompletionFieldsProducer(FieldsProducer delegateFieldsProducer, Map<String, CompletionsTermsReader> readers) {
    this.delegateFieldsProducer = delegateFieldsProducer;
    this.readers = readers;
  }

  CompletionFieldsProducer(SegmentReadState state) throws IOException {
    String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
    delegateFieldsProducer = null;
    boolean success = false;

    try (ChecksumIndexInput index = state.directory.openChecksumInput(indexFile, state.context)) {
      // open up dict file containing all fsts
      String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
      dictIn = state.directory.openInput(dictFile, state.context);
      CodecUtil.checkIndexHeader(dictIn, CODEC_NAME, COMPLETION_CODEC_VERSION, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      // just validate the footer for the dictIn
      CodecUtil.retrieveChecksum(dictIn);

      // open up index file (fieldNumber, offset)
      CodecUtil.checkIndexHeader(index, CODEC_NAME, COMPLETION_CODEC_VERSION, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      // load delegate PF
      PostingsFormat delegatePostingsFormat = PostingsFormat.forName(index.readString());
      delegateFieldsProducer = delegatePostingsFormat.fieldsProducer(state);

      // read suggest field numbers and their offsets in the terms file from index
      int numFields = index.readVInt();
      readers = new HashMap<>(numFields);
      for (int i = 0; i < numFields; i++) {
        int fieldNumber = index.readVInt();
        long offset = index.readVLong();
        FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNumber);
        // we don't load the FST yet
        readers.put(fieldInfo.name, new CompletionsTermsReader(offset));
      }
      CodecUtil.checkFooter(index);
      success = true;
    } finally {
      if (success == false) {
        IOUtils.closeWhileHandlingException(delegateFieldsProducer, dictIn);
      }
    }
  }

  @Override
  public void close() throws IOException {
    boolean success = false;
    try {
      delegateFieldsProducer.close();
      IOUtils.close(dictIn);
      success = true;
    } finally {
      if (success == false) {
        IOUtils.closeWhileHandlingException(delegateFieldsProducer, dictIn);
      }
    }
  }

  @Override
  public void checkIntegrity() throws IOException {
    delegateFieldsProducer.checkIntegrity();
    // TODO: checkIntegrity should checksum the dictionary and index
  }

  @Override
  public FieldsProducer getMergeInstance() throws IOException {
    return new CompletionFieldsProducer(delegateFieldsProducer, readers);
  }

  @Override
  public long ramBytesUsed() {
    long ramBytesUsed = delegateFieldsProducer.ramBytesUsed();
    for (CompletionsTermsReader reader : readers.values()) {
      ramBytesUsed += reader.ramBytesUsed();
    }
    return ramBytesUsed;
  }

  @Override
  public Collection<Accountable> getChildResources() {
    List<Accountable> accountableList = new ArrayList<>();
    for (Map.Entry<String, CompletionsTermsReader> readerEntry : readers.entrySet()) {
      accountableList.add(Accountables.namedAccountable(readerEntry.getKey(), readerEntry.getValue()));
    }
    return Collections.unmodifiableCollection(accountableList);
  }

  @Override
  public Iterator<String> iterator() {
    return readers.keySet().iterator();
  }

  @Override
  public Terms terms(String field) throws IOException {
    return new CompletionTerms(delegateFieldsProducer.terms(field), readers.get(field));
  }

  @Override
  public int size() {
    return readers.size();
  }

  private class CompletionsTermsReader implements Accountable {
    private final long offset;
    private NRTSuggester suggester;

    public CompletionsTermsReader(long offset) throws IOException {
      assert offset >= 0l && offset < dictIn.length();
      this.offset = offset;
    }

    public synchronized NRTSuggester suggester() throws IOException {
      if (suggester == null) {
        try (IndexInput dictClone = dictIn.clone()) { // let multiple fields load concurrently
          dictClone.seek(offset);
          suggester = NRTSuggester.load(dictClone);
        }
      }
      return suggester;
    }

    @Override
    public long ramBytesUsed() {
      return (suggester != null) ? suggester.ramBytesUsed() : 0;
    }

    @Override
    public Collection<Accountable> getChildResources() {
      return Collections.emptyList();
    }
  }

  /**
   * Thin wrapper over {@link org.apache.lucene.index.Terms} with
   * a {@link NRTSuggester}
   */
  public static class CompletionTerms extends FilterLeafReader.FilterTerms {

    private final CompletionsTermsReader reader;

    public CompletionTerms(Terms in, CompletionsTermsReader reader) {
      super(in);
      this.reader = reader;
    }

    /**
     * Returns a {@link NRTSuggester} for the field
     * or <code>null</code> if no FST
     * was indexed for this field
     */
    public NRTSuggester suggester() throws IOException {
      if (reader == null) {
        return null;
      }
      return reader.suggester();
    }
  }

}
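A same-package consumer (for example the SuggestIndexSearcher referenced elsewhere in this change) would reach the lazily loaded suggester through CompletionTerms. A sketch of that access path, assuming a LeafReader over a segment that indexed the hypothetical field suggest_title:

    Terms terms = leafReader.terms("suggest_title");
    if (terms instanceof CompletionFieldsProducer.CompletionTerms) {
      // the first call per field loads the FST from the .lkp file; later calls reuse it
      NRTSuggester suggester = ((CompletionFieldsProducer.CompletionTerms) terms).suggester();
      if (suggester != null) {
        // suggester.lookup(...) can now run against this segment
      }
    }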
@@ -0,0 +1,121 @@
package org.apache.lucene.search.suggest.document;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.FST;

/**
 * <p>
 * A {@link PostingsFormat} which supports document suggestion based on
 * indexed {@link SuggestField}s.
 * Document suggestion is based on a weighted FST which maps the analyzed
 * terms of a {@link SuggestField} to its surface form and document id.
 * </p>
 * <p>
 * Files:
 * <ul>
 *   <li><tt>.lkp</tt>: <a href="#Completiondictionary">Completion Dictionary</a></li>
 *   <li><tt>.cmp</tt>: <a href="#Completionindex">Completion Index</a></li>
 * </ul>
 * <p>
 * <a name="Completiondictionary"></a>
 * <h3>Completion Dictionary</h3>
 * <p>The .lkp file contains an FST for each suggest field
 * </p>
 * <ul>
 *   <li>CompletionDict (.lkp) --> Header, FST<sup>NumSuggestFields</sup>, Footer</li>
 *   <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <!-- TODO: should the FST output be mentioned at all? -->
 *   <li>FST --> {@link FST FST<Long, BytesRef>}</li>
 *   <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
 * <p>Notes:</p>
 * <ul>
 *   <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
 *   for the Completion implementation.</li>
 *   <li>FST maps all analyzed forms to surface forms of a SuggestField</li>
 * </ul>
 * <a name="Completionindex"></a>
 * <h3>Completion Index</h3>
 * <p>The .cmp file contains an index into the completion dictionary, so that it can be
 * accessed randomly.</p>
 * <ul>
 *   <li>CompletionIndex (.cmp) --> Header, NumSuggestFields, Entry<sup>NumSuggestFields</sup>, Footer</li>
 *   <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <li>NumSuggestFields --> {@link DataOutput#writeVInt Uint32}</li>
 *   <li>Entry --> FieldNumber, CompletionDictionaryOffset</li>
 *   <li>FieldNumber --> {@link DataOutput#writeVInt Uint32}</li>
 *   <li>CompletionDictionaryOffset --> {@link DataOutput#writeVLong Uint64}</li>
 *   <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
 * <p>Notes:</p>
 * <ul>
 *   <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
 *   for the Completion implementation.</li>
 *   <li>NumSuggestFields is the number of suggest fields indexed</li>
 *   <li>FieldNumber is the field's number from {@link FieldInfos} (.fnm)</li>
 *   <li>CompletionDictionaryOffset is the file offset of a field's FST in CompletionDictionary (.lkp)</li>
 * </ul>
 *
 * @lucene.experimental
 */
public abstract class CompletionPostingsFormat extends PostingsFormat {

  static final String CODEC_NAME = "completion";
  static final int COMPLETION_CODEC_VERSION = 1;
  static final int COMPLETION_VERSION_CURRENT = COMPLETION_CODEC_VERSION;
  static final String INDEX_EXTENSION = "cmp";
  static final String DICT_EXTENSION = "lkp";

  /**
   * Used only by core Lucene at read-time via Service Provider instantiation
   */
  public CompletionPostingsFormat() {
    super(CODEC_NAME);
  }

  /**
   * Concrete implementation should specify the delegating postings format
   */
  protected abstract PostingsFormat delegatePostingsFormat();

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    PostingsFormat delegatePostingsFormat = delegatePostingsFormat();
    if (delegatePostingsFormat == null) {
      throw new UnsupportedOperationException("Error - " + getClass().getName()
          + " has been constructed without a choice of PostingsFormat");
    }
    return new CompletionFieldsConsumer(delegatePostingsFormat, state);
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    return new CompletionFieldsProducer(state);
  }
}
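The no-arg constructor exists for read-time SPI resolution: the segment records the format name "completion", and core Lucene maps it back to a registered concrete subclass. A sketch of that path, assuming Completion50PostingsFormat (or another concrete subclass) is listed in META-INF/services/org.apache.lucene.codecs.PostingsFormat and that a SegmentReadState named segmentReadState is available:

    // Read side: resolve the on-disk format name, then open the producer, which
    // reads the .cmp index eagerly and the .lkp dictionary lazily per field.
    PostingsFormat pf = PostingsFormat.forName("completion");
    FieldsProducer producer = pf.fieldsProducer(segmentReadState);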
@@ -0,0 +1,358 @@
package org.apache.lucene.search.suggest.document;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Util;

import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL;

/**
 * Token stream which converts a provided token stream to an automaton.
 * The accepted strings enumerated from the automaton are available through the
 * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute.
 * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store
 * a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)})
 *
 */
final class CompletionTokenStream extends TokenStream {

  private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);
  private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
  private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);

  private final TokenStream input;
  private final boolean preserveSep;
  private final boolean preservePositionIncrements;
  private final int sepLabel;
  private final int maxGraphExpansions;

  private BytesRef payload;
  private Iterator<IntsRef> finiteStrings;
  private int posInc = -1;
  private CharTermAttribute charTermAttribute;

  /**
   * Creates a token stream to convert <code>input</code> to a token stream
   * of accepted strings by its automaton.
   * <p>
   * The token stream <code>input</code> is converted to an automaton
   * with the default settings of {@link org.apache.lucene.search.suggest.document.CompletionAnalyzer}
   */
  public CompletionTokenStream(TokenStream input) {
    this(input, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, SEP_LABEL, DEFAULT_MAX_GRAPH_EXPANSIONS);
  }

  CompletionTokenStream(TokenStream input, boolean preserveSep, boolean preservePositionIncrements, int sepLabel, int maxGraphExpansions) {
    // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
    // the input stream entirely in toFiniteStrings(input)
    this.input = input;
    this.preserveSep = preserveSep;
    this.preservePositionIncrements = preservePositionIncrements;
    this.sepLabel = sepLabel;
    this.maxGraphExpansions = maxGraphExpansions;
  }

  /**
   * Returns a separator label that is reserved for the payload
   * in {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)}
   */
  public int sepLabel() {
    return sepLabel;
  }

  /**
   * Sets a payload available throughout successive token stream enumeration
   */
  public void setPayload(BytesRef payload) {
    this.payload = payload;
  }

  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();
    if (finiteStrings == null) {
      //TODO: make this return a Iterator<IntsRef> instead?
      Automaton automaton = toAutomaton(input);
      Set<IntsRef> strings = Operations.getFiniteStrings(automaton, maxGraphExpansions);

      posInc = strings.size();
      finiteStrings = strings.iterator();
    }
    if (finiteStrings.hasNext()) {
      posAttr.setPositionIncrement(posInc);
      /*
       * this posInc encodes the number of paths that this surface form
       * produced. Multi Fields have the same surface form and therefore sum up
       */
      posInc = 0;
      Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // now we have UTF-8
      if (charTermAttribute != null) {
        charTermAttribute.setLength(0);
        charTermAttribute.append(bytesAtt.toUTF16());
      }
      if (payload != null) {
        payloadAttr.setPayload(this.payload);
      }
      return true;
    }

    return false;
  }

  @Override
  public void end() throws IOException {
    super.end();
    if (posInc == -1) {
      input.end();
    }
  }

  @Override
  public void close() throws IOException {
    if (posInc == -1) {
      input.close();
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    if (hasAttribute(CharTermAttribute.class)) {
      // we only create this if we really need it to save the UTF-8 to UTF-16 conversion
      charTermAttribute = getAttribute(CharTermAttribute.class);
    }
    finiteStrings = null;
    posInc = -1;
  }

  /**
   * Converts <code>tokenStream</code> to an automaton
   */
  public Automaton toAutomaton(TokenStream tokenStream) throws IOException {
    // TODO refactor this
    // maybe we could hook up a modified automaton from TermAutomatonQuery here?
    Automaton automaton = null;
    try {
      // Create corresponding automaton: labels are bytes
      // from each analyzed token, with byte 0 used as
      // separator between tokens:
      final TokenStreamToAutomaton tsta;
      if (preserveSep) {
        tsta = new EscapingTokenStreamToAutomaton((char) SEP_LABEL);
      } else {
        // When we're not preserving sep, we don't steal 0xff
        // byte, so we don't need to do any escaping:
        tsta = new TokenStreamToAutomaton();
      }
      tsta.setPreservePositionIncrements(preservePositionIncrements);

      automaton = tsta.toAutomaton(tokenStream);
    } finally {
      IOUtils.closeWhileHandlingException(tokenStream);
    }

    // TODO: we can optimize this somewhat by determinizing
    // while we convert
    automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
    // This automaton should not blow up during determinize:
    return Operations.determinize(automaton, maxGraphExpansions);
  }

  /**
   * Just escapes the 0xff byte (which we still use for SEP).
   */
  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {

    final BytesRefBuilder spare = new BytesRefBuilder();
    private char sepLabel;

    public EscapingTokenStreamToAutomaton(char sepLabel) {
      this.sepLabel = sepLabel;
    }

    @Override
    protected BytesRef changeToken(BytesRef in) {
      int upto = 0;
      for (int i = 0; i < in.length; i++) {
        byte b = in.bytes[in.offset + i];
        if (b == (byte) sepLabel) {
          spare.grow(upto + 2);
          spare.setByteAt(upto++, (byte) sepLabel);
          spare.setByteAt(upto++, b);
        } else {
          spare.grow(upto + 1);
          spare.setByteAt(upto++, b);
        }
      }
      spare.setLength(upto);
      return spare.get();
    }
  }

  // Replaces SEP with epsilon or remaps them if
  // we were asked to preserve them:
  private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {

    Automaton result = new Automaton();

    // Copy all states over
    int numStates = a.getNumStates();
    for (int s = 0; s < numStates; s++) {
      result.createState();
      result.setAccept(s, a.isAccept(s));
    }

    // Go in reverse topo sort so we know we only have to
    // make one pass:
    Transition t = new Transition();
    int[] topoSortStates = topoSortStates(a);
    for (int i = 0; i < topoSortStates.length; i++) {
      int state = topoSortStates[topoSortStates.length - 1 - i];
      int count = a.initTransition(state, t);
      for (int j = 0; j < count; j++) {
        a.getNextTransition(t);
        if (t.min == TokenStreamToAutomaton.POS_SEP) {
          assert t.max == TokenStreamToAutomaton.POS_SEP;
          if (preserveSep) {
            // Remap to SEP_LABEL:
            result.addTransition(state, t.dest, sepLabel);
          } else {
            result.addEpsilon(state, t.dest);
          }
        } else if (t.min == TokenStreamToAutomaton.HOLE) {
          assert t.max == TokenStreamToAutomaton.HOLE;

          // Just remove the hole: there will then be two
          // SEP tokens next to each other, which will only
          // match another hole at search time.  Note that
          // it will also match an empty-string token ... if
          // that's somehow a problem we can always map HOLE
          // to a dedicated byte (and escape it in the
          // input).
          result.addEpsilon(state, t.dest);
        } else {
          result.addTransition(state, t.dest, t.min, t.max);
        }
      }
    }

    result.finishState();

    return result;
  }

  private static int[] topoSortStates(Automaton a) {
    int[] states = new int[a.getNumStates()];
    final Set<Integer> visited = new HashSet<>();
    final LinkedList<Integer> worklist = new LinkedList<>();
    worklist.add(0);
    visited.add(0);
    int upto = 0;
    states[upto] = 0;
    upto++;
    Transition t = new Transition();
    while (worklist.size() > 0) {
      int s = worklist.removeFirst();
      int count = a.initTransition(s, t);
      for (int i = 0; i < count; i++) {
        a.getNextTransition(t);
        if (!visited.contains(t.dest)) {
          visited.add(t.dest);
          worklist.add(t.dest);
          states[upto++] = t.dest;
        }
      }
    }
    return states;
  }

  public interface ByteTermAttribute extends TermToBytesRefAttribute {
    // marker interface

    /**
     * Return the builder from which the term is derived.
     */
    public BytesRefBuilder builder();

    public CharSequence toUTF16();
  }

  public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute {
    private final BytesRefBuilder bytes = new BytesRefBuilder();
    private CharsRefBuilder charsRef;

    @Override
    public void fillBytesRef() {
      // does nothing - we change in place
    }

    @Override
    public BytesRefBuilder builder() {
      return bytes;
    }

    @Override
    public BytesRef getBytesRef() {
      return bytes.get();
    }

    @Override
    public void clear() {
      bytes.clear();
    }

    @Override
    public void copyTo(AttributeImpl target) {
      ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
      other.bytes.copyBytes(bytes);
    }

    @Override
    public CharSequence toUTF16() {
      if (charsRef == null) {
        charsRef = new CharsRefBuilder();
      }
      charsRef.copyUTF8Bytes(getBytesRef());
      return charsRef.get();
    }
  }
}
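A rough sketch of how an indexing component (such as the SuggestField this change builds on, not shown in this excerpt) might drive the stream; the payload bytes are assumed to carry the surface form and weight that CompletionFieldsConsumer later reads back, and the analyzer and field name are illustrative:

    CompletionTokenStream stream =
        new CompletionTokenStream(analyzer.tokenStream("suggest_title", "star wars"));
    stream.setPayload(payloadBytes);   // assumed layout: [VInt len][surface][VLong weight + 1]
    stream.reset();
    while (stream.incrementToken()) {
      // one token per finite string (per analyzed path) of the input token graph;
      // each carries the same payload and a UTF-8 term via ByteTermAttribute
    }
    stream.end();
    stream.close();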
@ -0,0 +1,324 @@
|
|||
package org.apache.lucene.search.suggest.document;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.search.CollectionTerminatedException;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.suggest.analyzing.FSTUtil;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.Accountable;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.PairOutputs;
|
||||
import org.apache.lucene.util.fst.PairOutputs.Pair;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseDocID;
|
||||
import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseSurfaceForm;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* NRTSuggester returns Top N completions with corresponding documents matching a provided automaton.
|
||||
* The completions are returned in descending order of their corresponding weight.
|
||||
* Deleted documents are filtered out in near real time using the provided reader.
|
||||
* A {@link org.apache.lucene.search.DocIdSet} can be passed in at query time to filter out documents.
|
||||
* </p>
|
||||
* <p>
|
||||
* See {@link #lookup(LeafReader, Automaton, int, DocIdSet, TopSuggestDocsCollector)} for more implementation
|
||||
* details.
|
||||
* <p>
|
||||
* Builder: {@link NRTSuggesterBuilder}
|
||||
* </p>
|
||||
* <p>
|
||||
* FST Format:
|
||||
* <ul>
|
||||
* <li>Input: analyzed forms of input terms</li>
|
||||
* <li>Output: Pair<Long, BytesRef> containing weight, surface form and docID</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* NOTE:
|
||||
* <ul>
|
||||
* <li>currently only {@link org.apache.lucene.search.DocIdSet} with random access capabilities are supported.</li>
|
||||
* <li>having too many deletions or using a very restrictive filter can make the search inadmissible due to
|
||||
* over-pruning of potential paths</li>
|
||||
* <li>when a {@link org.apache.lucene.search.DocIdSet} is used, it is assumed that the filter will roughly
|
||||
* filter out half the number of documents that match the provided automaton</li>
|
||||
* <li>lookup performance will degrade as more accepted completions lead to filtered out documents</li>
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
final class NRTSuggester implements Accountable {
|
||||
|
||||
/**
|
||||
* FST<Weight,Surface>:
|
||||
* input is the analyzed form, with a null byte between terms
|
||||
* and a {@link NRTSuggesterBuilder#END_BYTE} to denote the
|
||||
* end of the input
|
||||
* weight is a long
|
||||
* surface is the original, unanalyzed form followed by the docID
|
||||
*/
|
||||
private final FST<Pair<Long, BytesRef>> fst;
|
||||
|
||||
/**
|
||||
* Highest number of analyzed paths we saw for any single
|
||||
* input surface form. This can be > 1, when index analyzer
|
||||
* creates graphs or if multiple surface form(s) yields the
|
||||
* same analyzed form
|
||||
*/
|
||||
private final int maxAnalyzedPathsPerOutput;
|
||||
|
||||
/**
|
||||
* Separator used between surface form and its docID in the FST output
|
||||
*/
|
||||
private final int payloadSep;
|
||||
|
||||
/**
|
||||
* Label used to denote the end of an input in the FST and
|
||||
* the beginning of dedup bytes
|
||||
*/
|
||||
private final int endByte;
|
||||
|
||||
/**
|
||||
* Maximum queue depth for TopNSearcher
|
||||
*
|
||||
* NOTE: value should be <= Integer.MAX_VALUE
|
||||
*/
|
||||
private static final long MAX_TOP_N_QUEUE_SIZE = 1000;
|
||||
|
||||
private NRTSuggester(FST<Pair<Long, BytesRef>> fst, int maxAnalyzedPathsPerOutput, int payloadSep, int endByte) {
|
||||
this.fst = fst;
|
||||
this.maxAnalyzedPathsPerOutput = maxAnalyzedPathsPerOutput;
|
||||
this.payloadSep = payloadSep;
|
||||
this.endByte = endByte;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return fst == null ? 0 : fst.ramBytesUsed();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<Accountable> getChildResources() {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
private static Comparator<Pair<Long, BytesRef>> getComparator() {
|
||||
return new Comparator<Pair<Long, BytesRef>>() {
|
||||
@Override
|
||||
public int compare(Pair<Long, BytesRef> o1, Pair<Long, BytesRef> o2) {
|
||||
return Long.compare(o1.output1, o2.output1);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Collects at most Top <code>num</code> completions, filtered by <code>filter</code> on
|
||||
* corresponding documents, which has a prefix accepted by <code>automaton</code>
|
||||
* <p>
|
||||
* Supports near real time deleted document filtering using <code>reader</code>
|
||||
* <p>
|
||||
* {@link TopSuggestDocsCollector#collect(int, CharSequence, long)} is called
|
||||
* for every matched completion
|
||||
* <p>
|
||||
* Completion collection can be early terminated by throwing {@link org.apache.lucene.search.CollectionTerminatedException}
|
||||
*/
|
||||
public void lookup(final LeafReader reader, final Automaton automaton, final int num, final DocIdSet filter, final TopSuggestDocsCollector collector) {
|
||||
final Bits filterDocs;
|
||||
try {
|
||||
if (filter != null) {
|
||||
if (filter.iterator() == null) {
|
||||
return;
|
||||
}
|
||||
if (filter.bits() == null) {
|
||||
throw new IllegalArgumentException("DocIDSet does not provide random access interface");
|
||||
} else {
|
||||
filterDocs = filter.bits();
|
||||
}
|
||||
} else {
|
||||
filterDocs = null;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
int queueSize = getMaxTopNSearcherQueueSize(num, reader, filterDocs != null);
|
||||
if (queueSize == -1) {
|
||||
return;
|
||||
}
|
||||
|
||||
final Bits liveDocs = reader.getLiveDocs();
|
||||
try {
|
||||
final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
|
||||
Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num, queueSize, getComparator()) {
|
||||
|
||||
private final CharsRefBuilder spare = new CharsRefBuilder();
|
||||
|
||||
@Override
|
||||
protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
|
||||
int payloadSepIndex = parseSurfaceForm(output.output2, payloadSep, spare);
|
||||
int docID = parseDocID(output.output2, payloadSepIndex);
|
||||
|
||||
// filter out deleted docs only if no filter is set
|
||||
if (filterDocs == null && liveDocs != null && !liveDocs.get(docID)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// filter by filter context
|
||||
if (filterDocs != null && !filterDocs.get(docID)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
collector.collect(docID, spare.toCharsRef(), decode(output.output1));
|
||||
return true;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// TODO: add fuzzy support
|
||||
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
|
||||
searcher.addStartPaths(path.fstNode, path.output, false, path.input);
|
||||
}
|
||||
|
||||
try {
|
||||
// hits are also returned by search()
|
||||
// we do not use it, instead collect at acceptResult
|
||||
Util.TopResults<Pair<Long, BytesRef>> search = searcher.search();
|
||||
// search admissibility is not guaranteed
|
||||
// see comment on getMaxTopNSearcherQueueSize
|
||||
// assert search.isComplete;
|
||||
} catch (CollectionTerminatedException e) {
|
||||
// terminate
|
||||
}
|
||||
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple heuristics to try to avoid over-pruning potential suggestions by the
|
||||
* TopNSearcher. Since suggestion entries can be rejected if they belong
|
||||
* to a deleted document, the length of the TopNSearcher queue has to
|
||||
* be increased by some factor, to account for the filtered out suggestions.
|
||||
* This heuristic will try to make the searcher admissible, but the search
|
||||
* can still lead to over-pruning
|
||||
* <p>
|
||||
* If a <code>filter</code> is applied, the queue size is increased by
|
||||
* half the number of live documents.
|
||||
* <p>
|
||||
* The maximum queue size is {@link #MAX_TOP_N_QUEUE_SIZE}
|
||||
*/
|
||||
private int getMaxTopNSearcherQueueSize(int num, LeafReader reader, boolean filterEnabled) {
|
||||
double liveDocsRatio = calculateLiveDocRatio(reader.numDocs(), reader.maxDoc());
|
||||
if (liveDocsRatio == -1) {
|
||||
return -1;
|
||||
}
|
||||
long maxQueueSize = num * maxAnalyzedPathsPerOutput;
|
||||
// liveDocRatio can be at most 1.0 (if no docs were deleted)
|
||||
assert liveDocsRatio <= 1.0d;
|
||||
maxQueueSize = (long) (maxQueueSize / liveDocsRatio);
|
||||
if (filterEnabled) {
|
||||
maxQueueSize = maxQueueSize + (reader.numDocs()/2);
|
||||
}
|
||||
return (int) Math.min(MAX_TOP_N_QUEUE_SIZE, maxQueueSize);
|
||||
}
|
||||
|
||||
private static double calculateLiveDocRatio(int numDocs, int maxDocs) {
|
||||
return (numDocs > 0) ? ((double) numDocs / maxDocs) : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a {@link NRTSuggester} from {@link org.apache.lucene.store.IndexInput}
|
||||
*/
|
||||
public static NRTSuggester load(IndexInput input) throws IOException {
|
||||
final FST<Pair<Long, BytesRef>> fst = new FST<>(input, new PairOutputs<>(
|
||||
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
|
||||
|
||||
/* read some meta info */
|
||||
int maxAnalyzedPathsPerOutput = input.readVInt();
|
||||
int endByte = input.readVInt();
|
||||
int payloadSep = input.readVInt();
|
||||
|
||||
return new NRTSuggester(fst, maxAnalyzedPathsPerOutput, payloadSep, endByte);
|
||||
}
|
||||
|
||||
static long encode(long input) {
|
||||
if (input < 0) {
|
||||
throw new UnsupportedOperationException("cannot encode value: " + input);
|
||||
}
|
||||
return Long.MAX_VALUE - input;
|
||||
}
|
||||
|
||||
static long decode(long output) {
|
||||
return (Long.MAX_VALUE - output);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper to encode/decode payload (surface + PAYLOAD_SEP + docID) output
|
||||
*/
|
||||
static final class PayLoadProcessor {
|
||||
final static private int MAX_DOC_ID_LEN_WITH_SEP = 6; // vint takes at most 5 bytes
|
||||
|
||||
static int parseSurfaceForm(final BytesRef output, int payloadSep, CharsRefBuilder spare) {
|
||||
int surfaceFormLen = -1;
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
if (output.bytes[output.offset + i] == payloadSep) {
|
||||
surfaceFormLen = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert surfaceFormLen != -1 : "no payloadSep found, unable to determine surface form";
|
||||
spare.copyUTF8Bytes(output.bytes, output.offset, surfaceFormLen);
|
||||
return surfaceFormLen;
|
||||
}
|
||||
|
||||
static int parseDocID(final BytesRef output, int payloadSepIndex) {
|
||||
assert payloadSepIndex != -1 : "payload sep index can not be -1";
|
||||
ByteArrayDataInput input = new ByteArrayDataInput(output.bytes, payloadSepIndex + output.offset + 1, output.length - (payloadSepIndex + output.offset));
|
||||
return input.readVInt();
|
||||
}
|
||||
|
||||
static BytesRef make(final BytesRef surface, int docID, int payloadSep) throws IOException {
|
||||
int len = surface.length + MAX_DOC_ID_LEN_WITH_SEP;
|
||||
byte[] buffer = new byte[len];
|
||||
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
|
||||
output.writeBytes(surface.bytes, surface.offset, surface.length);
|
||||
output.writeByte((byte) payloadSep);
|
||||
output.writeVInt(docID);
|
||||
return new BytesRef(buffer, 0, output.getPosition());
|
||||
}
|
||||
}
|
||||
}
|
|
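A quick worked sketch of the two numeric schemes in the NRTSuggester code above: weights are encoded as Long.MAX_VALUE - weight, so the min-cost TopNSearcher traversal visits the highest-weighted completions first, and the queue-size heuristic scales num * maxAnalyzedPathsPerOutput by the live-doc ratio (plus half the live docs when a filter is set). The numbers below are made up for illustration and the MAX_TOP_N_QUEUE_SIZE cap is left out.

// Illustrative sketch only: mirrors the formulas above with assumed numbers.
public class NRTSuggesterMathSketch {

  // Same scheme as NRTSuggester.encode/decode: larger weights map to smaller outputs.
  static long encode(long weight) {
    if (weight < 0) {
      throw new UnsupportedOperationException("cannot encode value: " + weight);
    }
    return Long.MAX_VALUE - weight;
  }

  static long decode(long output) {
    return Long.MAX_VALUE - output;
  }

  public static void main(String[] args) {
    assert decode(encode(42L)) == 42L;   // round-trip is lossless
    assert encode(100L) < encode(1L);    // higher weight -> smaller FST output

    // Queue-size heuristic with assumed values: 10 requested suggestions,
    // 3 analyzed paths per output, 80 live docs out of 100, filter enabled.
    int num = 10, maxAnalyzedPathsPerOutput = 3, numDocs = 80, maxDoc = 100;
    double liveDocsRatio = (double) numDocs / maxDoc;      // 0.8
    long maxQueueSize = num * maxAnalyzedPathsPerOutput;   // 30
    maxQueueSize = (long) (maxQueueSize / liveDocsRatio);  // 37
    maxQueueSize += numDocs / 2;                           // +40 for the filter -> 77
    System.out.println("queue size before the MAX_TOP_N_QUEUE_SIZE cap: " + maxQueueSize);
  }
}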
@ -0,0 +1,165 @@
|
|||
package org.apache.lucene.search.suggest.document;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.PriorityQueue;
|
||||
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
import org.apache.lucene.util.fst.Builder;
|
||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.PairOutputs;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
import static org.apache.lucene.search.suggest.document.NRTSuggester.encode;
|
||||
|
||||
/**
|
||||
* Builder for {@link NRTSuggester}
|
||||
*
|
||||
*/
|
||||
final class NRTSuggesterBuilder {
|
||||
|
||||
/**
|
||||
* Label used to separate surface form and docID
|
||||
* in the output
|
||||
*/
|
||||
public static final int PAYLOAD_SEP = '\u001F';
|
||||
|
||||
/**
|
||||
* Marks end of the analyzed input and start of dedup
|
||||
* byte.
|
||||
*/
|
||||
private static final int END_BYTE = 0x0;
|
||||
|
||||
private final PairOutputs<Long, BytesRef> outputs;
|
||||
private final Builder<PairOutputs.Pair<Long, BytesRef>> builder;
|
||||
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||
private final BytesRefBuilder analyzed = new BytesRefBuilder();
|
||||
private final PriorityQueue<Entry> entries;
|
||||
private final int payloadSep;
|
||||
private final int endByte;
|
||||
|
||||
private int maxAnalyzedPathsPerOutput = 0;
|
||||
|
||||
/**
|
||||
* Create a builder for {@link NRTSuggester}
|
||||
*/
|
||||
public NRTSuggesterBuilder() {
|
||||
this.payloadSep = PAYLOAD_SEP;
|
||||
this.endByte = END_BYTE;
|
||||
this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
|
||||
this.entries = new PriorityQueue<>();
|
||||
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes an FST input term to add entries against
|
||||
*/
|
||||
public void startTerm(BytesRef analyzed) {
|
||||
this.analyzed.copyBytes(analyzed);
|
||||
this.analyzed.append((byte) endByte);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds an entry for the latest input term, should be called after
|
||||
* {@link #startTerm(org.apache.lucene.util.BytesRef)} on the desired input
|
||||
*/
|
||||
public void addEntry(int docID, BytesRef surfaceForm, long weight) throws IOException {
|
||||
BytesRef payloadRef = NRTSuggester.PayLoadProcessor.make(surfaceForm, docID, payloadSep);
|
||||
entries.add(new Entry(payloadRef, encode(weight)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes all the entries for the FST input term
|
||||
*/
|
||||
public void finishTerm() throws IOException {
|
||||
int numArcs = 0;
|
||||
int numDedupBytes = 1;
|
||||
analyzed.grow(analyzed.length() + 1);
|
||||
analyzed.setLength(analyzed.length() + 1);
|
||||
for (Entry entry : entries) {
|
||||
if (numArcs == maxNumArcsForDedupByte(numDedupBytes)) {
|
||||
analyzed.setByteAt(analyzed.length() - 1, (byte) (numArcs));
|
||||
analyzed.grow(analyzed.length() + 1);
|
||||
analyzed.setLength(analyzed.length() + 1);
|
||||
numArcs = 0;
|
||||
numDedupBytes++;
|
||||
}
|
||||
analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
|
||||
Util.toIntsRef(analyzed.get(), scratchInts);
|
||||
builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
|
||||
}
|
||||
maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
|
||||
entries.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds and stores a FST that can be loaded with
|
||||
* {@link NRTSuggester#load(org.apache.lucene.store.IndexInput)}
|
||||
*/
|
||||
public boolean store(DataOutput output) throws IOException {
|
||||
final FST<PairOutputs.Pair<Long, BytesRef>> build = builder.finish();
|
||||
if (build == null) {
|
||||
return false;
|
||||
}
|
||||
build.save(output);
|
||||
|
||||
/* write some more meta-info */
|
||||
assert maxAnalyzedPathsPerOutput > 0;
|
||||
output.writeVInt(maxAnalyzedPathsPerOutput);
|
||||
output.writeVInt(END_BYTE);
|
||||
output.writeVInt(PAYLOAD_SEP);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Num arcs for nth dedup byte:
|
||||
* if n <= 5: 1 + (2 * n)
|
||||
* else: (1 + (2 * n)) * n
|
||||
* <p>
|
||||
* TODO: is there a better way to make the fst built to be
|
||||
* more TopNSearcher friendly?
|
||||
*/
|
||||
private static int maxNumArcsForDedupByte(int currentNumDedupBytes) {
|
||||
int maxArcs = 1 + (2 * currentNumDedupBytes);
|
||||
if (currentNumDedupBytes > 5) {
|
||||
maxArcs *= currentNumDedupBytes;
|
||||
}
|
||||
return Math.min(maxArcs, 255);
|
||||
}
|
||||
|
||||
private final static class Entry implements Comparable<Entry> {
|
||||
final BytesRef payload;
|
||||
final long weight;
|
||||
|
||||
public Entry(BytesRef payload, long weight) {
|
||||
this.payload = payload;
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(Entry o) {
|
||||
return Long.compare(weight, o.weight);
|
||||
}
|
||||
}
|
||||
}
|
|
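For intuition about the dedup bytes appended in finishTerm above: entries that share the same analyzed form are kept distinct in the FST by a trailing byte, and maxNumArcsForDedupByte bounds how many entries may share each successive byte before another one is appended. A tiny sketch, reusing the exact formula from the builder, prints the per-byte capacities:

// Sketch of the capacities implied by NRTSuggesterBuilder.maxNumArcsForDedupByte.
public class DedupByteCapacitySketch {

  // Same formula as in the builder above.
  static int maxNumArcsForDedupByte(int currentNumDedupBytes) {
    int maxArcs = 1 + (2 * currentNumDedupBytes);
    if (currentNumDedupBytes > 5) {
      maxArcs *= currentNumDedupBytes;
    }
    return Math.min(maxArcs, 255);
  }

  public static void main(String[] args) {
    // For n = 1..8 this prints 3, 5, 7, 9, 11, 78, 105, 136 (always capped at 255).
    for (int n = 1; n <= 8; n++) {
      System.out.println("dedup byte #" + n + " holds up to " + maxNumArcsForDedupByte(n) + " entries");
    }
  }
}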
@ -0,0 +1,123 @@
|
|||
package org.apache.lucene.search.suggest.document;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.store.OutputStreamDataOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Field that indexes a string value and a weight as a weighted completion
|
||||
* against a named suggester.
|
||||
* The field is tokenized, not stored, and indexes documents, frequencies and positions.
|
||||
* Field can be used to provide near real time document suggestions.
|
||||
* </p>
|
||||
* <p>
|
||||
* Besides the usual {@link org.apache.lucene.analysis.Analyzer}s,
|
||||
* {@link CompletionAnalyzer}
|
||||
* can be used to tune suggest field only parameters
|
||||
* (e.g. preserving token separators, preserving position increments
|
||||
* when converting the token stream to an automaton)
|
||||
* </p>
|
||||
* <p>
|
||||
* Example indexing usage:
|
||||
* <pre class="prettyprint">
|
||||
* document.add(new SuggestField(name, "suggestion", 4));
|
||||
* </pre>
|
||||
* To perform document suggestions based on this field, use
|
||||
* {@link SuggestIndexSearcher#suggest(String, CharSequence, int, org.apache.lucene.search.Filter)}
|
||||
* <p>
|
||||
* Example query usage:
|
||||
* <pre class="prettyprint">
|
||||
* SuggestIndexSearcher indexSearcher = ..
|
||||
* indexSearcher.suggest(name, "su", 2)
|
||||
* </pre>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SuggestField extends Field {
|
||||
|
||||
private static final FieldType FIELD_TYPE = new FieldType();
|
||||
|
||||
static {
|
||||
FIELD_TYPE.setTokenized(true);
|
||||
FIELD_TYPE.setStored(false);
|
||||
FIELD_TYPE.setStoreTermVectors(false);
|
||||
FIELD_TYPE.setOmitNorms(false);
|
||||
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
|
||||
FIELD_TYPE.freeze();
|
||||
}
|
||||
|
||||
private final BytesRef surfaceForm;
|
||||
private final long weight;
|
||||
|
||||
/**
|
||||
* Creates a {@link SuggestField}
|
||||
*
|
||||
* @param name of the field
|
||||
* @param value to get suggestions on
|
||||
* @param weight weight of the suggestion
|
||||
*/
|
||||
public SuggestField(String name, String value, long weight) {
|
||||
super(name, value, FIELD_TYPE);
|
||||
if (weight < 0l) {
|
||||
throw new IllegalArgumentException("weight must be >= 0");
|
||||
}
|
||||
this.surfaceForm = new BytesRef(value);
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
|
||||
TokenStream stream = super.tokenStream(analyzer, reuse);
|
||||
CompletionTokenStream completionStream;
|
||||
if (stream instanceof CompletionTokenStream) {
|
||||
completionStream = (CompletionTokenStream) stream;
|
||||
} else {
|
||||
completionStream = new CompletionTokenStream(stream);
|
||||
}
|
||||
BytesRef suggestPayload = buildSuggestPayload(surfaceForm, weight, (char) completionStream.sepLabel());
|
||||
completionStream.setPayload(suggestPayload);
|
||||
return completionStream;
|
||||
}
|
||||
|
||||
private BytesRef buildSuggestPayload(BytesRef surfaceForm, long weight, char sepLabel) throws IOException {
|
||||
for (int i = 0; i < surfaceForm.length; i++) {
|
||||
if (surfaceForm.bytes[i] == sepLabel) {
|
||||
assert sepLabel == '\u001f';
|
||||
throw new IllegalArgumentException(
|
||||
"surface form cannot contain unit separator character U+001F; this character is reserved");
|
||||
}
|
||||
}
|
||||
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
|
||||
output.writeVInt(surfaceForm.length);
|
||||
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
|
||||
output.writeVLong(weight + 1);
|
||||
}
|
||||
return new BytesRef(byteArrayOutputStream.toByteArray());
|
||||
}
|
||||
}
|
|
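The SuggestField javadoc above shows the indexing call but not the codec wiring: the suggest field has to be written with the completion postings format. Below is a minimal hedged sketch of one way to do that with a per-field codec override; the field name, analyzer and directory are illustrative and not part of this commit (the test helper iwcWithSuggestField used further below presumably does something equivalent).

// Hedged sketch: route a suggest field to Completion50PostingsFormat via a per-field codec.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.suggest.document.Completion50PostingsFormat;
import org.apache.lucene.search.suggest.document.SuggestField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class SuggestFieldIndexingSketch {
  public static void main(String[] args) throws Exception {
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    // Only the suggest field uses the completion postings format; other fields keep the default.
    iwc.setCodec(new Lucene50Codec() {
      final PostingsFormat completion = new Completion50PostingsFormat();
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return "suggest_field".equals(field) ? completion : super.getPostingsFormatForField(field);
      }
    });
    Directory dir = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
      Document doc = new Document();
      doc.add(new SuggestField("suggest_field", "suggestion", 4));
      writer.addDocument(doc);
    }
  }
}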
@ -0,0 +1,150 @@
|
|||
package org.apache.lucene.search.suggest.document;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
|
||||
import static org.apache.lucene.search.suggest.document.CompletionFieldsProducer.CompletionTerms;
|
||||
|
||||
/**
|
||||
* Adds document suggest capabilities to IndexSearcher
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class SuggestIndexSearcher extends IndexSearcher {
|
||||
|
||||
private final Analyzer queryAnalyzer;
|
||||
|
||||
/**
|
||||
* Creates a searcher with document suggest capabilities
|
||||
* for <code>reader</code>.
|
||||
* <p>
|
||||
* Suggestion <code>key</code> is analyzed with <code>queryAnalyzer</code>
|
||||
*/
|
||||
public SuggestIndexSearcher(IndexReader reader, Analyzer queryAnalyzer) {
|
||||
super(reader);
|
||||
this.queryAnalyzer = queryAnalyzer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls {@link #suggest(String, CharSequence, int, Filter)}
|
||||
* with no document filter
|
||||
*/
|
||||
public TopSuggestDocs suggest(String field, CharSequence key, int num) throws IOException {
|
||||
return suggest(field, key, num, (Filter) null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls {@link #suggest(String, CharSequence, int, Filter, TopSuggestDocsCollector)}
|
||||
* with no document filter
|
||||
*/
|
||||
public void suggest(String field, CharSequence key, int num, TopSuggestDocsCollector collector) throws IOException {
|
||||
suggest(field, key, num, null, collector);
|
||||
}
|
||||
|
||||
/**
|
||||
* Suggests at most <code>num</code> documents filtered by <code>filter</code>
|
||||
* that complete to <code>key</code> for a suggest <code>field</code>
|
||||
* <p>
|
||||
* Returns at most the top <code>num</code> document ids with their corresponding completion and weight
|
||||
*
|
||||
* @throws java.lang.IllegalArgumentException if <code>filter</code> does not provide a random access
|
||||
* interface or if <code>field</code> is not a {@link SuggestField}
|
||||
*/
|
||||
public TopSuggestDocs suggest(String field, CharSequence key, int num, Filter filter) throws IOException {
|
||||
TopSuggestDocsCollector collector = new TopSuggestDocsCollector(num);
|
||||
suggest(field, key, num, filter, collector);
|
||||
return collector.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Suggests at most <code>num</code> documents filtered by <code>filter</code>
|
||||
* that complete to <code>key</code> for a suggest <code>field</code>
|
||||
* <p>
|
||||
* Collect completions with {@link TopSuggestDocsCollector}
|
||||
* The completions are collected in order of the suggest <code>field</code> weight.
|
||||
* There can be more than one collection of the same document, if the <code>key</code>
|
||||
* matches multiple <code>field</code> values of the same document
|
||||
*
|
||||
* @throws java.lang.IllegalArgumentException if <code>filter</code> does not provide a random access
|
||||
* interface or if <code>field</code> is not a {@link SuggestField}
|
||||
*/
|
||||
public void suggest(String field, CharSequence key, int num, Filter filter, TopSuggestDocsCollector collector) throws IOException {
|
||||
// verify input
|
||||
if (field == null) {
|
||||
throw new IllegalArgumentException("'field' can not be null");
|
||||
}
|
||||
if (num <= 0) {
|
||||
throw new IllegalArgumentException("'num' should be > 0");
|
||||
}
|
||||
if (collector == null) {
|
||||
throw new IllegalArgumentException("'collector' can not be null");
|
||||
}
|
||||
|
||||
// build query automaton
|
||||
CompletionAnalyzer analyzer;
|
||||
if (queryAnalyzer instanceof CompletionAnalyzer) {
|
||||
analyzer = (CompletionAnalyzer) queryAnalyzer;
|
||||
} else {
|
||||
analyzer = new CompletionAnalyzer(queryAnalyzer);
|
||||
}
|
||||
final Automaton automaton = analyzer.toAutomaton(field, key);
|
||||
|
||||
// collect results
|
||||
for (LeafReaderContext context : getIndexReader().leaves()) {
|
||||
TopSuggestDocsCollector leafCollector = (TopSuggestDocsCollector) collector.getLeafCollector(context);
|
||||
LeafReader reader = context.reader();
|
||||
Terms terms = reader.terms(field);
|
||||
if (terms == null) {
|
||||
continue;
|
||||
}
|
||||
NRTSuggester suggester;
|
||||
if (terms instanceof CompletionTerms) {
|
||||
CompletionTerms completionTerms = (CompletionTerms) terms;
|
||||
suggester = completionTerms.suggester();
|
||||
} else {
|
||||
throw new IllegalArgumentException(field + " is not a SuggestField");
|
||||
}
|
||||
if (suggester == null) {
|
||||
// a segment can have a null suggester
|
||||
// i.e. no FST was built
|
||||
continue;
|
||||
}
|
||||
|
||||
DocIdSet docIdSet = null;
|
||||
if (filter != null) {
|
||||
docIdSet = filter.getDocIdSet(context, reader.getLiveDocs());
|
||||
if (docIdSet == null) {
|
||||
// filter matches no docs in the current leaf
|
||||
continue;
|
||||
}
|
||||
}
|
||||
suggester.lookup(reader, automaton, num, docIdSet, leafCollector);
|
||||
}
|
||||
}
|
||||
}
|
|
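A minimal lookup sketch against the SuggestIndexSearcher above, assuming an index built as in the SuggestField sketch earlier; the directory, analyzer and field name are illustrative.

// Hedged usage sketch for SuggestIndexSearcher.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.suggest.document.SuggestIndexSearcher;
import org.apache.lucene.search.suggest.document.TopSuggestDocs;
import org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;
import org.apache.lucene.store.Directory;

public class SuggestLookupSketch {
  public static void printTopSuggestions(Directory dir) throws Exception {
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader, new StandardAnalyzer());
      // Top 5 completions for the prefix "su" in the suggest field, no filter.
      TopSuggestDocs suggestions = searcher.suggest("suggest_field", "su", 5);
      for (SuggestScoreDoc hit : suggestions.scoreLookupDocs()) {
        System.out.println(hit.doc + " -> " + hit.key + " (weight=" + (long) hit.score + ")");
      }
    }
  }
}

Passing a Filter instead of using the no-filter overload requires one that supports random access, as the javadoc above notes; otherwise suggest() throws IllegalArgumentException.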
@ -0,0 +1,56 @@
|
|||
package org.apache.lucene.search.suggest.document;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/**
|
||||
* Bounded priority queue for {@link SuggestScoreDoc}s.
|
||||
* Priority is based on {@link SuggestScoreDoc#score} and tie
|
||||
* is broken by {@link SuggestScoreDoc#doc}
|
||||
*/
|
||||
final class SuggestScoreDocPriorityQueue extends PriorityQueue<SuggestScoreDoc> {
|
||||
/**
|
||||
* Creates a new priority queue of the specified size.
|
||||
*/
|
||||
public SuggestScoreDocPriorityQueue(int size) {
|
||||
super(size);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean lessThan(SuggestScoreDoc a, SuggestScoreDoc b) {
|
||||
if (a.score == b.score) {
|
||||
// prefer smaller doc id, in case of a tie
|
||||
return a.doc > b.doc;
|
||||
}
|
||||
return a.score < b.score;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the top N results in descending order.
|
||||
*/
|
||||
public SuggestScoreDoc[] getResults() {
|
||||
int size = size();
|
||||
SuggestScoreDoc[] res = new SuggestScoreDoc[size];
|
||||
for (int i = size - 1; i >= 0; i--) {
|
||||
res[i] = pop();
|
||||
}
|
||||
return res;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,111 @@
|
|||
package org.apache.lucene.search.suggest.document;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.suggest.Lookup;
|
||||
|
||||
/**
|
||||
* {@link org.apache.lucene.search.TopDocs} wrapper with
|
||||
* an additional CharSequence key per {@link org.apache.lucene.search.ScoreDoc}
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class TopSuggestDocs extends TopDocs {
|
||||
|
||||
/**
|
||||
* Singleton for empty {@link TopSuggestDocs}
|
||||
*/
|
||||
public final static TopSuggestDocs EMPTY = new TopSuggestDocs(0, new SuggestScoreDoc[0], 0);
|
||||
|
||||
/**
|
||||
* {@link org.apache.lucene.search.ScoreDoc} with an
|
||||
* additional CharSequence key
|
||||
*/
|
||||
public static class SuggestScoreDoc extends ScoreDoc implements Comparable<SuggestScoreDoc> {
|
||||
|
||||
/**
|
||||
* Matched completion key
|
||||
*/
|
||||
public CharSequence key;
|
||||
|
||||
/**
|
||||
* Creates a SuggestScoreDoc instance
|
||||
*
|
||||
* @param doc document id (hit)
|
||||
* @param key matched completion
|
||||
* @param score weight of the matched completion
|
||||
*/
|
||||
public SuggestScoreDoc(int doc, CharSequence key, long score) {
|
||||
// loss of precision but not magnitude
|
||||
// implicit conversion from long -> float
|
||||
super(doc, score);
|
||||
this.key = key;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(SuggestScoreDoc o) {
|
||||
return Lookup.CHARSEQUENCE_COMPARATOR.compare(key, o.key);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link org.apache.lucene.search.TopDocs} wrapper with
|
||||
* {@link TopSuggestDocs.SuggestScoreDoc}
|
||||
* instead of {@link org.apache.lucene.search.ScoreDoc}
|
||||
*/
|
||||
public TopSuggestDocs(int totalHits, SuggestScoreDoc[] scoreDocs, float maxScore) {
|
||||
super(totalHits, scoreDocs, maxScore);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns {@link TopSuggestDocs.SuggestScoreDoc}s
|
||||
* for this instance
|
||||
*/
|
||||
public SuggestScoreDoc[] scoreLookupDocs() {
|
||||
return (SuggestScoreDoc[]) scoreDocs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new TopSuggestDocs, containing topN results across
|
||||
* the provided TopSuggestDocs, sorted by score. Each {@link TopSuggestDocs}
|
||||
* instance must be sorted.
|
||||
* Analogous to {@link org.apache.lucene.search.TopDocs#merge(int, org.apache.lucene.search.TopDocs[])}
|
||||
* for {@link TopSuggestDocs}
|
||||
*
|
||||
* NOTE: assumes every <code>shardHit</code> is already sorted by score
|
||||
*/
|
||||
public static TopSuggestDocs merge(int topN, TopSuggestDocs[] shardHits) {
|
||||
SuggestScoreDocPriorityQueue priorityQueue = new SuggestScoreDocPriorityQueue(topN);
|
||||
for (TopSuggestDocs shardHit : shardHits) {
|
||||
for (SuggestScoreDoc scoreDoc : shardHit.scoreLookupDocs()) {
|
||||
if (scoreDoc == priorityQueue.insertWithOverflow(scoreDoc)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
SuggestScoreDoc[] topNResults = priorityQueue.getResults();
|
||||
if (topNResults.length > 0) {
|
||||
return new TopSuggestDocs(topNResults.length, topNResults, topNResults[0].score);
|
||||
} else {
|
||||
return TopSuggestDocs.EMPTY;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
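When suggestions come back from several shards or indexes, the merge method above combines the already-sorted per-shard results. A short hedged sketch:

// Hedged sketch: merging per-shard suggestions with TopSuggestDocs.merge.
import org.apache.lucene.search.suggest.document.TopSuggestDocs;

public class MergeSuggestionsSketch {
  // Each element is assumed to be the result of a per-shard suggest() call,
  // already sorted by score as merge() requires.
  public static TopSuggestDocs mergeShards(int topN, TopSuggestDocs[] shardHits) {
    TopSuggestDocs merged = TopSuggestDocs.merge(topN, shardHits);
    System.out.println("kept " + merged.scoreLookupDocs().length + " of the shard suggestions");
    return merged;
  }
}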
@ -0,0 +1,118 @@
|
|||
package org.apache.lucene.search.suggest.document;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.search.CollectionTerminatedException;
|
||||
import org.apache.lucene.search.SimpleCollector;
|
||||
|
||||
import static org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;
|
||||
|
||||
/**
|
||||
* {@link org.apache.lucene.search.Collector} that collects completion and
|
||||
* score, along with document id
|
||||
* <p>
|
||||
* Non-scoring collector that collects completions in order of their
|
||||
* pre-defined weight.
|
||||
* <p>
|
||||
* NOTE: One document can be collected multiple times if a document
|
||||
* is matched for multiple unique completions for a given query
|
||||
* <p>
|
||||
* Subclasses should only override {@link TopSuggestDocsCollector#collect(int, CharSequence, long)};
|
||||
* {@link #setScorer(org.apache.lucene.search.Scorer)} is not
|
||||
* used
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class TopSuggestDocsCollector extends SimpleCollector {
|
||||
|
||||
private final SuggestScoreDocPriorityQueue priorityQueue;
|
||||
|
||||
/**
|
||||
* Document base offset for the current leaf
|
||||
*/
|
||||
protected int docBase;
|
||||
|
||||
/**
|
||||
* Sole constructor
|
||||
*
|
||||
* Collects at most <code>num</code> completions
|
||||
* with corresponding document and weight
|
||||
*/
|
||||
public TopSuggestDocsCollector(int num) {
|
||||
if (num <= 0) {
|
||||
throw new IllegalArgumentException("'num' must be > 0");
|
||||
}
|
||||
this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doSetNextReader(LeafReaderContext context) throws IOException {
|
||||
docBase = context.docBase;
|
||||
}
|
||||
|
||||
/**
|
||||
* Called for every matched completion,
|
||||
* similar to {@link org.apache.lucene.search.LeafCollector#collect(int)}
|
||||
* but for completions.
|
||||
*
|
||||
* NOTE: collection at the leaf level is guaranteed to be in
|
||||
* descending order of score
|
||||
*/
|
||||
public void collect(int docID, CharSequence key, long score) throws IOException {
|
||||
SuggestScoreDoc current = new SuggestScoreDoc(docBase + docID, key, score);
|
||||
if (current == priorityQueue.insertWithOverflow(current)) {
|
||||
// if the current SuggestScoreDoc has overflown from pq,
|
||||
// we can assume all of the successive collections from
|
||||
// this leaf will be overflown as well
|
||||
// TODO: reuse the overflow instance?
|
||||
throw new CollectionTerminatedException();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns at most <code>num</code> Top scoring {@link org.apache.lucene.search.suggest.document.TopSuggestDocs}s
|
||||
*/
|
||||
public TopSuggestDocs get() throws IOException {
|
||||
SuggestScoreDoc[] suggestScoreDocs = priorityQueue.getResults();
|
||||
if (suggestScoreDocs.length > 0) {
|
||||
return new TopSuggestDocs(suggestScoreDocs.length, suggestScoreDocs, suggestScoreDocs[0].score);
|
||||
} else {
|
||||
return TopSuggestDocs.EMPTY;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Ignored
|
||||
*/
|
||||
@Override
|
||||
public void collect(int doc) throws IOException {
|
||||
// {@link #collect(int, CharSequence, long)} is used
|
||||
// instead
|
||||
}
|
||||
|
||||
/**
|
||||
* Ignored
|
||||
*/
|
||||
@Override
|
||||
public boolean needsScores() {
|
||||
return false;
|
||||
}
|
||||
}
|
|
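The javadoc above says subclasses should only override collect(int, CharSequence, long). A hedged sketch of such a subclass that logs every collected completion and then delegates to the default queueing behaviour:

// Hedged sketch: a TopSuggestDocsCollector subclass that traces collected completions.
import java.io.IOException;

import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector;

public class LoggingSuggestCollector extends TopSuggestDocsCollector {

  public LoggingSuggestCollector(int num) {
    super(num);
  }

  @Override
  public void collect(int docID, CharSequence key, long score) throws IOException {
    // docID is relative to the current leaf; the superclass adds docBase when queueing.
    System.out.println("collected doc=" + docID + " key=" + key + " weight=" + score);
    // Delegate so get() still returns the top results (and early termination still applies).
    super.collect(docID, key, score);
  }
}

Such a collector would be passed to SuggestIndexSearcher#suggest(String, CharSequence, int, Filter, TopSuggestDocsCollector) and the results read back with get().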
@ -0,0 +1,21 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Support for document suggestion
|
||||
*/
|
||||
package org.apache.lucene.search.suggest.document;
|
|
@ -0,0 +1,33 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.search.suggest.document.Completion50PostingsFormat
|
|
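The services entry above registers the new postings format with Java's SPI so the codec layer can resolve it by name at read time. A small hedged sketch that simply lists the registered format names to confirm the entry was picked up:

// Hedged sketch: list the postings formats discoverable through META-INF/services.
import org.apache.lucene.codecs.PostingsFormat;

public class ListPostingsFormatsSketch {
  public static void main(String[] args) {
    // With the suggest module on the classpath, the completion format registered
    // above should appear among these names.
    for (String name : PostingsFormat.availablePostingsFormats()) {
      System.out.println(name);
    }
  }
}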
@ -0,0 +1,786 @@
|
|||
package org.apache.lucene.search.suggest.document;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.CopyOnWriteArrayList;
|
||||
import java.util.concurrent.CyclicBarrier;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.IntField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.StorableField;
|
||||
import org.apache.lucene.index.StoredDocument;
|
||||
import org.apache.lucene.queries.TermsQuery;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.NumericRangeQuery;
|
||||
import org.apache.lucene.search.QueryWrapperFilter;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.LineFileDocs;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.apache.lucene.search.suggest.document.TopSuggestDocs.*;
|
||||
import static org.hamcrest.core.IsEqual.equalTo;
|
||||
|
||||
public class SuggestFieldTest extends LuceneTestCase {
|
||||
|
||||
public Directory dir;
|
||||
|
||||
@Before
|
||||
public void before() throws Exception {
|
||||
dir = newDirectory();
|
||||
}
|
||||
|
||||
@After
|
||||
public void after() throws Exception {
|
||||
dir.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimple() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
Document document = new Document();
|
||||
|
||||
document.add(newSuggestField("suggest_field", "abc", 3l));
|
||||
document.add(newSuggestField("suggest_field", "abd", 4l));
|
||||
document.add(newSuggestField("suggest_field", "The Foo Fighters", 2l));
|
||||
iw.addDocument(document);
|
||||
document.clear();
|
||||
document.add(newSuggestField("suggest_field", "abcdd", 5));
|
||||
iw.addDocument(document);
|
||||
|
||||
if (rarely()) {
|
||||
iw.commit();
|
||||
}
|
||||
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
TopSuggestDocs lookupDocs = suggestIndexSearcher.suggest("suggest_field", "ab", 3);
|
||||
assertSuggestions(lookupDocs, new Entry("abcdd", 5), new Entry("abd", 4), new Entry("abc", 3));
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultipleSuggestFieldsPerDoc() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "sug_field_1", "sug_field_2"));
|
||||
|
||||
Document document = new Document();
|
||||
document.add(newSuggestField("sug_field_1", "apple", 4));
|
||||
document.add(newSuggestField("sug_field_2", "april", 3));
|
||||
iw.addDocument(document);
|
||||
document.clear();
|
||||
document.add(newSuggestField("sug_field_1", "aples", 3));
|
||||
document.add(newSuggestField("sug_field_2", "apartment", 2));
|
||||
iw.addDocument(document);
|
||||
|
||||
if (rarely()) {
|
||||
iw.commit();
|
||||
}
|
||||
|
||||
DirectoryReader reader = iw.getReader();
|
||||
|
||||
SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
TopSuggestDocs suggestDocs1 = suggestIndexSearcher.suggest("sug_field_1", "ap", 4);
|
||||
assertSuggestions(suggestDocs1, new Entry("apple", 4), new Entry("aples", 3));
|
||||
TopSuggestDocs suggestDocs2 = suggestIndexSearcher.suggest("sug_field_2", "ap", 4);
|
||||
assertSuggestions(suggestDocs2, new Entry("april", 3), new Entry("apartment", 2));
|
||||
|
||||
// check that the doc ids are consistent
|
||||
for (int i = 0; i < suggestDocs1.scoreDocs.length; i++) {
|
||||
ScoreDoc suggestScoreDoc = suggestDocs1.scoreDocs[i];
|
||||
assertThat(suggestScoreDoc.doc, equalTo(suggestDocs2.scoreDocs[i].doc));
|
||||
}
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDupSuggestFieldValues() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
int num = atLeast(300);
|
||||
long[] weights = new long[num];
|
||||
for(int i = 0; i < num; i++) {
|
||||
Document document = new Document();
|
||||
weights[i] = Math.abs(random().nextLong());
|
||||
document.add(newSuggestField("suggest_field", "abc", weights[i]));
|
||||
iw.addDocument(document);
|
||||
}
|
||||
if (rarely()) {
|
||||
iw.commit();
|
||||
}
|
||||
|
||||
DirectoryReader reader = iw.getReader();
|
||||
Entry[] expectedEntries = new Entry[num];
|
||||
Arrays.sort(weights);
|
||||
for (int i = 1; i <= num; i++) {
|
||||
expectedEntries[i - 1] = new Entry("abc", weights[num - i]);
|
||||
}
|
||||
|
||||
SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
TopSuggestDocs lookupDocs = suggestIndexSearcher.suggest("suggest_field", "abc", num);
|
||||
assertSuggestions(lookupDocs, expectedEntries);
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNRTDeletedDocFiltering() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
// using IndexWriter instead of RandomIndexWriter
|
||||
IndexWriter iw = new IndexWriter(dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
|
||||
int num = atLeast(10);
|
||||
|
||||
Document document = new Document();
|
||||
int numLive = 0;
|
||||
List<Entry> expectedEntries = new ArrayList<>();
|
||||
for (int i = 0; i < num; i++) {
|
||||
document.add(newSuggestField("suggest_field", "abc_" + i, num - i));
|
||||
if (i % 2 == 0) {
|
||||
document.add(newStringField("str_field", "delete", Field.Store.YES));
|
||||
} else {
|
||||
numLive++;
|
||||
expectedEntries.add(new Entry("abc_" + i, num - i));
|
||||
document.add(newStringField("str_field", "no_delete", Field.Store.YES));
|
||||
}
|
||||
iw.addDocument(document);
|
||||
document.clear();
|
||||
}
|
||||
|
||||
// get docIDs to delete
|
||||
DirectoryReader reader = DirectoryReader.open(iw, false);
|
||||
List<Integer> docIdsToDelete = new ArrayList<>();
|
||||
for (int i = 0; i < reader.maxDoc(); i++) {
|
||||
StoredDocument doc = reader.document(i);
|
||||
if ("delete".equals(doc.get("str_field"))) {
|
||||
docIdsToDelete.add(i);
|
||||
}
|
||||
}
|
||||
|
||||
for (Integer docID : docIdsToDelete) {
|
||||
assertTrue(iw.tryDeleteDocument(reader, docID));
|
||||
}
|
||||
reader.close();
|
||||
|
||||
reader = DirectoryReader.open(iw, false);
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", numLive);
|
||||
assertSuggestions(suggest, expectedEntries.toArray(new Entry[expectedEntries.size()]));
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSuggestOnAllFilteredDocuments() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
int num = atLeast(10);
|
||||
Document document = new Document();
|
||||
for (int i = 0; i < num; i++) {
|
||||
document.add(newSuggestField("suggest_field", "abc_" + i, i));
|
||||
document.add(newStringField("str_fld", "deleted", Field.Store.NO));
|
||||
iw.addDocument(document);
|
||||
document.clear();
|
||||
}
|
||||
|
||||
Filter filter = new QueryWrapperFilter(new TermsQuery("str_fld", new BytesRef("non_existent")));
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
// no random access required;
|
||||
// calling suggest with filter that does not match any documents should early terminate
|
||||
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", num, filter);
|
||||
assertThat(suggest.totalHits, equalTo(0));
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSuggestOnAllDeletedDocuments() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
// using IndexWriter instead of RandomIndexWriter
|
||||
IndexWriter iw = new IndexWriter(dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
int num = atLeast(10);
|
||||
Document document = new Document();
|
||||
for (int i = 0; i < num; i++) {
|
||||
document.add(newSuggestField("suggest_field", "abc_" + i, i));
|
||||
iw.addDocument(document);
|
||||
document.clear();
|
||||
}
|
||||
|
||||
DirectoryReader reader = DirectoryReader.open(iw, false);
|
||||
for (int docID = 0; docID < reader.maxDoc(); docID++) {
|
||||
assertTrue(iw.tryDeleteDocument(reader, docID));
|
||||
}
|
||||
reader.close();
|
||||
|
||||
reader = DirectoryReader.open(iw, false);
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", num);
|
||||
assertThat(suggest.totalHits, equalTo(0));
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSuggestOnMostlyDeletedDocuments() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
// using IndexWriter instead of RandomIndexWriter
|
||||
IndexWriter iw = new IndexWriter(dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
int num = atLeast(10);
|
||||
Document document = new Document();
|
||||
for (int i = 1; i <= num; i++) {
|
||||
document.add(newSuggestField("suggest_field", "abc_" + i, i));
|
||||
document.add(new IntField("weight_fld", i, Field.Store.YES));
|
||||
iw.addDocument(document);
|
||||
document.clear();
|
||||
}
|
||||
|
||||
DirectoryReader reader = DirectoryReader.open(iw, false);
|
||||
// delete all but the lowest scored suggestion
|
||||
for (int docID = 0; docID < reader.maxDoc(); docID++) {
|
||||
StoredDocument doc = reader.document(docID);
|
||||
StorableField[] weights = doc.getFields("weight_fld");
|
||||
assertThat(weights.length, equalTo(1));
|
||||
int weight = (int) weights[0].numericValue();
|
||||
if (weight != 1) {
|
||||
assertTrue(iw.tryDeleteDocument(reader, docID));
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
|
||||
reader = DirectoryReader.open(iw, false);
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", 1);
|
||||
assertSuggestions(suggest, new Entry("abc_1", 1));
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSuggestOnMostlyFilteredOutDocuments() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
int num = atLeast(10);
|
||||
Document document = new Document();
|
||||
for (int i = 0; i < num; i++) {
|
||||
document.add(newSuggestField("suggest_field", "abc_" + i, i));
|
||||
document.add(new IntField("filter_int_fld", i, Field.Store.NO));
|
||||
iw.addDocument(document);
|
||||
document.clear();
|
||||
}
|
||||
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
|
||||
int topScore = num/2;
|
||||
QueryWrapperFilter filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", 0, topScore, true, true));
|
||||
Filter filter = randomAccessFilter(filterWrapper);
|
||||
// if at most half of the top scoring documents have been filtered out
|
||||
// the search should be admissible
|
||||
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", 1, filter);
|
||||
assertSuggestions(suggest, new Entry("abc_" + topScore, topScore));
|
||||
|
||||
filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", 0, 0, true, true));
|
||||
filter = randomAccessFilter(filterWrapper);
|
||||
// if more than half of the top scoring documents have been filtered out
|
||||
// search is not admissible, so # of suggestions requested is num instead of 1
|
||||
suggest = indexSearcher.suggest("suggest_field", "abc_", num, filter);
|
||||
assertSuggestions(suggest, new Entry("abc_0", 0));
|
||||
|
||||
filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", num - 1, num - 1, true, true));
|
||||
filter = randomAccessFilter(filterWrapper);
|
||||
// if only lower scoring documents are filtered out
|
||||
// search is admissible
|
||||
suggest = indexSearcher.suggest("suggest_field", "abc_", 1, filter);
|
||||
assertSuggestions(suggest, new Entry("abc_" + (num - 1), num - 1));
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEarlyTermination() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
int num = atLeast(10);
|
||||
Document document = new Document();
|
||||
|
||||
// have segments of 4 documents
|
||||
// with descending suggestion weights
|
||||
// suggest should early terminate for
|
||||
// segments with docs having lower suggestion weights
|
||||
for (int i = num; i > 0; i--) {
|
||||
document.add(newSuggestField("suggest_field", "abc_" + i, i));
|
||||
iw.addDocument(document);
|
||||
document.clear();
|
||||
if (i % 4 == 0) {
|
||||
iw.commit();
|
||||
}
|
||||
}
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", 1);
|
||||
assertSuggestions(suggest, new Entry("abc_" + num, num));
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultipleSegments() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
int num = atLeast(10);
|
||||
Document document = new Document();
|
||||
List<Entry> entries = new ArrayList<>();
|
||||
|
||||
// ensure at least some segments have no suggest field
|
||||
for (int i = num; i > 0; i--) {
|
||||
if (random().nextInt(4) == 1) {
|
||||
document.add(newSuggestField("suggest_field", "abc_" + i, i));
|
||||
entries.add(new Entry("abc_" + i, i));
|
||||
}
|
||||
document.add(new IntField("weight_fld", i, Field.Store.YES));
|
||||
iw.addDocument(document);
|
||||
document.clear();
|
||||
if (usually()) {
|
||||
iw.commit();
|
||||
}
|
||||
}
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", (entries.size() == 0) ? 1 : entries.size());
|
||||
assertSuggestions(suggest, entries.toArray(new Entry[entries.size()]));
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDocFiltering() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
|
||||
Document document = new Document();
|
||||
document.add(new IntField("filter_int_fld", 9, Field.Store.NO));
|
||||
document.add(newSuggestField("suggest_field", "apples", 3));
|
||||
iw.addDocument(document);
|
||||
|
||||
document.clear();
|
||||
document.add(new IntField("filter_int_fld", 10, Field.Store.NO));
|
||||
document.add(newSuggestField("suggest_field", "applle", 4));
|
||||
iw.addDocument(document);
|
||||
|
||||
document.clear();
|
||||
document.add(new IntField("filter_int_fld", 4, Field.Store.NO));
|
||||
document.add(newSuggestField("suggest_field", "apple", 5));
|
||||
iw.addDocument(document);
|
||||
|
||||
if (rarely()) {
|
||||
iw.commit();
|
||||
}
|
||||
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
|
||||
// suggest without filter
|
||||
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "app", 3);
|
||||
assertSuggestions(suggest, new Entry("apple", 5), new Entry("applle", 4), new Entry("apples", 3));
|
||||
|
||||
// suggest with filter
|
||||
QueryWrapperFilter filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", 5, 12, true, true));
|
||||
Filter filter = randomAccessFilter(filterWrapper);
|
||||
suggest = indexSearcher.suggest("suggest_field", "app", 3, filter);
|
||||
assertSuggestions(suggest, new Entry("applle", 4), new Entry("apples", 3));
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReturnedDocID() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
|
||||
|
||||
Document document = new Document();
|
||||
int num = atLeast(10);
|
||||
for (int i = 0; i < num; i++) {
|
||||
document.add(newSuggestField("suggest_field", "abc_" + i, num));
|
||||
document.add(new IntField("int_field", i, Field.Store.YES));
|
||||
iw.addDocument(document);
|
||||
document.clear();
|
||||
|
||||
if (random().nextBoolean()) {
|
||||
iw.commit();
|
||||
}
|
||||
}
|
||||
|
||||
DirectoryReader reader = iw.getReader();
|
||||
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
|
||||
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", num);
|
||||
assertEquals(num, suggest.totalHits);
|
||||
for (SuggestScoreDoc suggestScoreDoc : suggest.scoreLookupDocs()) {
|
||||
String key = suggestScoreDoc.key.toString();
|
||||
assertTrue(key.startsWith("abc_"));
|
||||
String substring = key.substring(4);
|
||||
int fieldValue = Integer.parseInt(substring);
|
||||
StoredDocument doc = reader.document(suggestScoreDoc.doc);
|
||||
assertEquals(doc.getField("int_field").numericValue().intValue(), fieldValue);
|
||||
}
|
||||
|
||||
reader.close();
|
||||
iw.close();
|
||||
}
|
||||
|
||||
  @Test
  public void testCompletionAnalyzerOptions() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
    Map<String, Analyzer> map = new HashMap<>();
    map.put("suggest_field_default", new CompletionAnalyzer(analyzer));
    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, true);
    map.put("suggest_field_no_p_sep", completionAnalyzer);
    completionAnalyzer = new CompletionAnalyzer(analyzer, true, false);
    map.put("suggest_field_no_p_pos_inc", completionAnalyzer);
    completionAnalyzer = new CompletionAnalyzer(analyzer, false, false);
    map.put("suggest_field_no_p_sep_or_pos_inc", completionAnalyzer);
    PerFieldAnalyzerWrapper analyzerWrapper = new PerFieldAnalyzerWrapper(analyzer, map);
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzerWrapper, map.keySet()));

    Document document = new Document();
    document.add(newSuggestField("suggest_field_default", "foobar", 7));
    document.add(newSuggestField("suggest_field_default", "foo bar", 8));
    document.add(newSuggestField("suggest_field_default", "the fo", 9));
    document.add(newSuggestField("suggest_field_default", "the foo bar", 10));

    document.add(newSuggestField("suggest_field_no_p_sep", "foobar", 7));
    document.add(newSuggestField("suggest_field_no_p_sep", "foo bar", 8));
    document.add(newSuggestField("suggest_field_no_p_sep", "the fo", 9));
    document.add(newSuggestField("suggest_field_no_p_sep", "the foo bar", 10));

    document.add(newSuggestField("suggest_field_no_p_pos_inc", "foobar", 7));
    document.add(newSuggestField("suggest_field_no_p_pos_inc", "foo bar", 8));
    document.add(newSuggestField("suggest_field_no_p_pos_inc", "the fo", 9));
    document.add(newSuggestField("suggest_field_no_p_pos_inc", "the foo bar", 10));

    document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "foobar", 7));
    document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "foo bar", 8));
    document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "the fo", 9));
    document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "the foo bar", 10));
    iw.addDocument(document);

    DirectoryReader reader = iw.getReader();
    SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);

    TopSuggestDocs suggest;
    suggest = indexSearcher.suggest("suggest_field_default", "fo", 4);
    assertSuggestions(suggest, new Entry("foo bar", 8), new Entry("foobar", 7));
    suggest = indexSearcher.suggest("suggest_field_default", "foob", 4);
    assertSuggestions(suggest, new Entry("foobar", 7));

    suggest = indexSearcher.suggest("suggest_field_no_p_sep", "fo", 4); // matches all 4
    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
    suggest = indexSearcher.suggest("suggest_field_no_p_sep", "foob", 4); // matches all but "the fo"
    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));

    suggest = indexSearcher.suggest("suggest_field_no_p_pos_inc", "fo", 4); // matches all 4
    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
    suggest = indexSearcher.suggest("suggest_field_no_p_pos_inc", "foob", 4); // matches only "foobar"
    assertSuggestions(suggest, new Entry("foobar", 7));

    suggest = indexSearcher.suggest("suggest_field_no_p_sep_or_pos_inc", "fo", 4); // matches all 4
    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
    suggest = indexSearcher.suggest("suggest_field_no_p_sep_or_pos_inc", "foob", 4); // matches all but "the fo"
    assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));

    reader.close();
    iw.close();
  }
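
  // Indexes entries with random weights and verifies that suggestions come back in
  // non-increasing score order, with each score equal to the entry's indexed weight.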
  @Test
  public void testScoring() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));

    int num = atLeast(100);
    String[] prefixes = {"abc", "bac", "cab"};
    Map<String, Long> mappings = new HashMap<>();
    for (int i = 0; i < num; i++) {
      Document document = new Document();
      String suggest = prefixes[i % 3] + TestUtil.randomSimpleString(random(), 10) + "_" + String.valueOf(i);
      long weight = Math.abs(random().nextLong());
      document.add(newSuggestField("suggest_field", suggest, weight));
      mappings.put(suggest, weight);
      iw.addDocument(document);
    }

    DirectoryReader reader = iw.getReader();
    SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
    for (String prefix : prefixes) {
      TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", prefix, num);
      assertTrue(suggest.totalHits > 0);
      float topScore = -1;
      for (SuggestScoreDoc scoreDoc : suggest.scoreLookupDocs()) {
        if (topScore != -1) {
          assertTrue(topScore >= scoreDoc.score);
        }
        topScore = scoreDoc.score;
        assertThat((float) mappings.get(scoreDoc.key.toString()), equalTo(scoreDoc.score));
        assertNotNull(mappings.remove(scoreDoc.key.toString()));
      }
    }

    assertThat(mappings.size(), equalTo(0));
    reader.close();
    iw.close();
  }
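
  // Uses LineFileDocs titles as suggestion keys; for every indexed title, at least one
  // returned suggestion must score exactly the highest weight recorded for that title.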
  @Test
  public void testRealisticKeys() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
    LineFileDocs lineFileDocs = new LineFileDocs(random());
    int num = atLeast(100);
    Map<String, Long> mappings = new HashMap<>();
    for (int i = 0; i < num; i++) {
      Document document = lineFileDocs.nextDoc();
      String title = document.getField("title").stringValue();
      long weight = Math.abs(random().nextLong());
      Long prevWeight = mappings.get(title);
      if (prevWeight == null || prevWeight < weight) {
        mappings.put(title, weight);
      }
      Document doc = new Document();
      doc.add(newSuggestField("suggest_field", title, weight));
      iw.addDocument(doc);

      if (rarely()) {
        iw.commit();
      }
    }

    DirectoryReader reader = iw.getReader();
    SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);

    for (Map.Entry<String, Long> entry : mappings.entrySet()) {
      String title = entry.getKey();

      TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", title, mappings.size());
      assertTrue(suggest.totalHits > 0);
      boolean matched = false;
      for (ScoreDoc scoreDoc : suggest.scoreDocs) {
        matched = Float.compare(scoreDoc.score, (float) entry.getValue()) == 0;
        if (matched) {
          break;
        }
      }
      assertTrue("at least one of the entries should have the score", matched);
    }

    reader.close();
    iw.close();
  }
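
  // Runs concurrent suggest() calls against three suggest fields; each thread asserts the full
  // expected entry list per field, and any failure is collected and reported after joining.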
  @Test
  public void testThreads() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field_1", "suggest_field_2", "suggest_field_3"));
    int num = atLeast(100);
    final String prefix1 = "abc1_";
    final String prefix2 = "abc2_";
    final String prefix3 = "abc3_";
    final Entry[] entries1 = new Entry[num];
    final Entry[] entries2 = new Entry[num];
    final Entry[] entries3 = new Entry[num];
    for (int i = 0; i < num; i++) {
      int weight = num - (i + 1);
      entries1[i] = new Entry(prefix1 + weight, weight);
      entries2[i] = new Entry(prefix2 + weight, weight);
      entries3[i] = new Entry(prefix3 + weight, weight);
    }
    for (int i = 0; i < num; i++) {
      Document doc = new Document();
      doc.add(newSuggestField("suggest_field_1", prefix1 + i, i));
      doc.add(newSuggestField("suggest_field_2", prefix2 + i, i));
      doc.add(newSuggestField("suggest_field_3", prefix3 + i, i));
      iw.addDocument(doc);

      if (rarely()) {
        iw.commit();
      }
    }

    DirectoryReader reader = iw.getReader();
    int numThreads = TestUtil.nextInt(random(), 2, 7);
    Thread[] threads = new Thread[numThreads];
    final CyclicBarrier startingGun = new CyclicBarrier(numThreads + 1);
    final CopyOnWriteArrayList<Throwable> errors = new CopyOnWriteArrayList<>();
    final SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
    for (int i = 0; i < threads.length; i++) {
      threads[i] = new Thread() {
        @Override
        public void run() {
          try {
            startingGun.await();
            TopSuggestDocs suggest = indexSearcher.suggest("suggest_field_1", prefix1, num);
            assertSuggestions(suggest, entries1);
            suggest = indexSearcher.suggest("suggest_field_2", prefix2, num);
            assertSuggestions(suggest, entries2);
            suggest = indexSearcher.suggest("suggest_field_3", prefix3, num);
            assertSuggestions(suggest, entries3);
          } catch (Throwable e) {
            errors.add(e);
          }
        }
      };
      threads[i].start();
    }

    startingGun.await();
    for (Thread t : threads) {
      t.join();
    }
    assertTrue(errors.toString(), errors.isEmpty());

    reader.close();
    iw.close();
  }
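
  // Wraps a filter so the DocIdSet it produces also exposes random-access Bits,
  // precomputed into a FixedBitSet from the wrapped filter's iterator.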
  private static Filter randomAccessFilter(Filter filter) {
    return new Filter() {
      @Override
      public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException {
        DocIdSet docIdSet = filter.getDocIdSet(context, acceptDocs);
        DocIdSetIterator iterator = docIdSet.iterator();
        FixedBitSet bits = new FixedBitSet(context.reader().maxDoc());
        if (iterator != null) {
          int doc;
          while ((doc = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            bits.set(doc);
          }
        }
        return new DocIdSet() {
          @Override
          public DocIdSetIterator iterator() throws IOException {
            return iterator;
          }

          @Override
          public Bits bits() throws IOException {
            return bits;
          }

          @Override
          public long ramBytesUsed() {
            return docIdSet.ramBytesUsed();
          }
        };
      }

      @Override
      public String toString(String field) {
        return filter.toString(field);
      }
    };
  }
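
  // Expected suggestion (key text plus score) used by assertSuggestions below.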
  private static class Entry {
    private final String output;
    private final float value;

    private Entry(String output, float value) {
      this.output = output;
      this.value = value;
    }
  }
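
  // Asserts that the returned suggestions match the expected entries exactly and in order,
  // comparing both the suggestion key and its score.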
  private void assertSuggestions(TopDocs actual, Entry... expected) {
    SuggestScoreDoc[] suggestScoreDocs = (SuggestScoreDoc[]) actual.scoreDocs;
    assertThat(suggestScoreDocs.length, equalTo(expected.length));
    for (int i = 0; i < suggestScoreDocs.length; i++) {
      SuggestScoreDoc lookupDoc = suggestScoreDocs[i];
      assertThat(lookupDoc.key.toString(), equalTo(expected[i].output));
      assertThat(lookupDoc.score, equalTo(expected[i].value));
    }
  }
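
  // Convenience factory for building a SuggestField with the given name, value and weight.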
  private SuggestField newSuggestField(String name, String value, long weight) throws IOException {
    return new SuggestField(name, value, weight);
  }
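
  // Builds an IndexWriterConfig whose codec routes the named suggest fields through
  // Completion50PostingsFormat, while all other fields keep the default postings format.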
  private IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, String... suggestFields) {
    return iwcWithSuggestField(analyzer, asSet(suggestFields));
  }

  private IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, Set<String> suggestFields) {
    IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
    iwc.setMergePolicy(newLogMergePolicy());
    Codec filterCodec = new Lucene50Codec() {
      PostingsFormat postingsFormat = new Completion50PostingsFormat();

      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        if (suggestFields.contains(field)) {
          return postingsFormat;
        }
        return super.getPostingsFormatForField(field);
      }
    };
    iwc.setCodec(filterCodec);
    return iwc;
  }
}