LUCENE-6339: Added Near-real time Document Suggester via custom postings format

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1669698 13f79535-47bb-0310-9956-ffa450edef68
Areek Zillur 2015-03-27 22:37:49 +00:00
parent f49f2b2af6
commit b7adb6ac77
18 changed files with 3005 additions and 1 deletion

View File

@ -19,6 +19,9 @@ New Features
  for counting ranges that align with the underlying terms as defined by the
  NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley)
+* LUCENE-6339: Added Near-real time Document Suggester via custom postings format
+  (Areek Zillur, Mike McCandless, Simon Willnauer)
 API Changes
* LUCENE-3312: The API of oal.document was restructured to * LUCENE-3312: The API of oal.document was restructured to

View File

@ -49,7 +49,7 @@ public class FSTUtil {
  public final FST.Arc<T> fstNode;
  /** Output of the path so far: */
- T output;
+ public final T output;
  /** Input of the path so far: */
  public final IntsRefBuilder input;

View File

@ -0,0 +1,42 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.PostingsFormat;
/**
* A {@link org.apache.lucene.search.suggest.document.CompletionPostingsFormat}
* that delegates to {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat}
*
* @lucene.experimental
*/
public class Completion50PostingsFormat extends CompletionPostingsFormat {
/**
* Sole Constructor
*/
public Completion50PostingsFormat() {
super();
}
@Override
protected PostingsFormat delegatePostingsFormat() {
return PostingsFormat.forName("Lucene50");
}
}
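Usage note (a minimal sketch, not part of this commit): the completion format is wired in per field through the codec. The field name "suggest_field" and the SuggestCodecSketch class below are illustrative assumptions based on the Lucene 5.x per-field codec API.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.index.IndexWriterConfig;

public class SuggestCodecSketch {
  // Routes only the hypothetical "suggest_field" to the completion postings format,
  // leaving every other field on the default Lucene50 postings format.
  public static IndexWriterConfig newConfig(Analyzer analyzer) {
    final PostingsFormat completionPostingsFormat = new Completion50PostingsFormat();
    Codec codec = new Lucene50Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        if ("suggest_field".equals(field)) {
          return completionPostingsFormat;
        }
        return super.getPostingsFormatForField(field);
      }
    };
    return new IndexWriterConfig(analyzer).setCodec(codec);
  }
}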

View File

@ -0,0 +1,173 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
/**
* Wraps an {@link org.apache.lucene.analysis.Analyzer}
* to provide additional completion-only tuning
* (e.g. preserving token separators, preserving position increments while converting
* a token stream to an automaton)
* <p>
* Can be used to index {@link SuggestField}
* and as a query analyzer to {@link SuggestIndexSearcher}
* <p>
* NOTE: In most cases, the index and query analyzer should have the same values for {@link #preservePositionIncrements}
* and {@link #preserveSep}
*
* @lucene.experimental
*/
public class CompletionAnalyzer extends AnalyzerWrapper {
/**
* Represents the separation between tokens, if
* <code>preserveSep</code> is <code>true</code>
* <p>
* Same label is used as a delimiter in the {@link org.apache.lucene.search.suggest.document.CompletionTokenStream}
* payload
*/
final static int SEP_LABEL = NRTSuggesterBuilder.PAYLOAD_SEP;
/**
* Represent a hole character, inserted by {@link org.apache.lucene.analysis.TokenStreamToAutomaton}
*/
final static int HOLE_CHARACTER = TokenStreamToAutomaton.HOLE;
final static int DEFAULT_MAX_GRAPH_EXPANSIONS = -1;
final static boolean DEFAULT_PRESERVE_SEP = true;
final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
private final Analyzer analyzer;
/**
* Preserve separation between tokens
* when converting to an automaton
* <p>
* Defaults to <code>true</code>
*/
private final boolean preserveSep;
/**
* Preserve position increments for tokens
* when converting to an automaton
* <p>
* Defaults to <code>true</code>
*/
private final boolean preservePositionIncrements;
/**
* Sets the maximum number of graph expansions of a completion automaton
* <p>
* Defaults to <code>-1</code> (no limit)
*/
private final int maxGraphExpansions;
/**
* Wraps an analyzer to convert its output token stream to an automaton
*
* @param analyzer token stream to be converted to an automaton
* @param preserveSep Preserve separation between tokens when converting to an automaton
* @param preservePositionIncrements Preserve position increments for tokens when converting to an automaton
* @param maxGraphExpansions Sets the maximum number of graph expansions of a completion automaton
*/
public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
super(PER_FIELD_REUSE_STRATEGY);
this.analyzer = analyzer;
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.maxGraphExpansions = maxGraphExpansions;
}
/**
* Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
* preserving token separation and position increments, with no limit on graph expansions
*/
public CompletionAnalyzer(Analyzer analyzer) {
this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
}
/**
* Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
* with no limit on graph expansions
*/
public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements) {
this(analyzer, preserveSep, preservePositionIncrements, DEFAULT_MAX_GRAPH_EXPANSIONS);
}
/**
* Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
* preserving token separation and position increments
*/
public CompletionAnalyzer(Analyzer analyzer, int maxGraphExpansions) {
this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions);
}
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return analyzer;
}
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
CompletionTokenStream tokenStream = new CompletionTokenStream(components.getTokenStream(),
preserveSep, preservePositionIncrements, SEP_LABEL, maxGraphExpansions);
return new TokenStreamComponents(components.getTokenizer(), tokenStream);
}
/**
* Converts <code>key</code> to an automaton using
* {@link #preservePositionIncrements}, {@link #preserveSep}
* and {@link #maxGraphExpansions}
*/
public Automaton toAutomaton(String field, CharSequence key) throws IOException {
for (int i = 0; i < key.length(); i++) {
switch (key.charAt(i)) {
case HOLE_CHARACTER:
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
case SEP_LABEL:
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
default:
break;
}
}
try (TokenStream tokenStream = analyzer.tokenStream(field, key.toString())) {
try(CompletionTokenStream stream = new CompletionTokenStream(tokenStream,
preserveSep, preservePositionIncrements, SEP_LABEL, maxGraphExpansions)) {
return stream.toAutomaton(tokenStream);
}
}
}
}
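A small construction sketch (the StandardAnalyzer and the expansion cap of 256 are illustrative assumptions): the same analyzer settings would normally be used at index and query time, per the NOTE in the class javadoc.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class CompletionAnalyzerSketch {
  public static Analyzer defaults() {
    // preserves token separators and position increments, no graph expansion limit
    return new CompletionAnalyzer(new StandardAnalyzer());
  }
  public static Analyzer tuned() {
    // keep separators, drop position increments, cap graph expansions at 256 paths
    return new CompletionAnalyzer(new StandardAnalyzer(), true, false, 256);
  }
}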

View File

@ -0,0 +1,192 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.CODEC_NAME;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;
/**
* <p>
* Weighted FSTs for all indexed {@link SuggestField}s are built on {@link #write(Fields)}.
* A weighted FST maps the analyzed forms of a field to its
* surface form and document id. FSTs are stored in the CompletionDictionary (.lkp).
* </p>
* <p>
* The file offsets of a field's FST are stored in the CompletionIndex (.cmp)
* along with the field's internal number {@link FieldInfo#number} on {@link #close()}.
* </p>
*
*/
final class CompletionFieldsConsumer extends FieldsConsumer {
private final String delegatePostingsFormatName;
private final Map<String, Long> seenFields = new HashMap<>();
private final SegmentWriteState state;
private IndexOutput dictOut;
private FieldsConsumer delegateFieldsConsumer;
CompletionFieldsConsumer(PostingsFormat delegatePostingsFormat, SegmentWriteState state) throws IOException {
this.delegatePostingsFormatName = delegatePostingsFormat.getName();
this.state = state;
String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
boolean success = false;
try {
this.delegateFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
dictOut = state.directory.createOutput(dictFile, state.context);
CodecUtil.writeIndexHeader(dictOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
}
}
}
@Override
public void write(Fields fields) throws IOException {
delegateFieldsConsumer.write(fields);
for (String field : fields) {
CompletionTermWriter termWriter = new CompletionTermWriter();
Terms terms = fields.terms(field);
TermsEnum termsEnum = terms.iterator(null);
// write terms
BytesRef term;
while ((term = termsEnum.next()) != null) {
termWriter.write(term, termsEnum);
}
// store lookup, if needed
long filePointer = dictOut.getFilePointer();
if (termWriter.finish(dictOut)) {
seenFields.put(field, filePointer);
}
}
}
private boolean closed = false;
@Override
public void close() throws IOException {
if (closed) {
return;
}
closed = true;
String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
boolean success = false;
try (IndexOutput indexOut = state.directory.createOutput(indexFile, state.context)) {
delegateFieldsConsumer.close();
CodecUtil.writeIndexHeader(indexOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
/*
* we write the delegate postings format name so we can load it
* without getting an instance in the ctor
*/
indexOut.writeString(delegatePostingsFormatName);
// write # of seen fields
indexOut.writeVInt(seenFields.size());
// write field numbers and dictOut offsets
for (Map.Entry<String, Long> seenField : seenFields.entrySet()) {
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(seenField.getKey());
indexOut.writeVInt(fieldInfo.number);
indexOut.writeVLong(seenField.getValue());
}
CodecUtil.writeFooter(indexOut);
CodecUtil.writeFooter(dictOut);
IOUtils.close(dictOut);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
}
}
}
// builds an FST based on the terms written
private static class CompletionTermWriter {
private PostingsEnum postingsEnum = null;
private int docCount = 0;
private final BytesRefBuilder scratch = new BytesRefBuilder();
private final NRTSuggesterBuilder builder;
public CompletionTermWriter() {
builder = new NRTSuggesterBuilder();
}
/**
* Stores the built FST in <code>output</code>
* Returns true if there was anything stored, false otherwise
*/
public boolean finish(IndexOutput output) throws IOException {
boolean stored = builder.store(output);
assert stored || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
return stored;
}
/**
* Writes all postings (surface form, weight, document id) for <code>term</code>
*/
public void write(BytesRef term, TermsEnum termsEnum) throws IOException {
postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.PAYLOADS);
builder.startTerm(term);
int docFreq = 0;
while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int docID = postingsEnum.docID();
for (int i = 0; i < postingsEnum.freq(); i++) {
postingsEnum.nextPosition();
assert postingsEnum.getPayload() != null;
BytesRef payload = postingsEnum.getPayload();
ByteArrayDataInput input = new ByteArrayDataInput(payload.bytes, payload.offset, payload.length);
int len = input.readVInt();
scratch.grow(len);
scratch.setLength(len);
input.readBytes(scratch.bytes(), 0, scratch.length());
builder.addEntry(docID, scratch.get(), input.readVLong() - 1);
}
docFreq++;
docCount = Math.max(docCount, docFreq + 1);
}
builder.finishTerm();
}
}
}

View File

@ -0,0 +1,228 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.IOUtils;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.CODEC_NAME;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_CODEC_VERSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;
/**
* <p>
* Completion index (.cmp) is opened and read at instantiation to read in {@link SuggestField}
* numbers and their FST offsets in the Completion dictionary (.lkp).
* </p>
* <p>
* Completion dictionary (.lkp) is opened at instantiation and a field's FST is loaded
* into memory the first time it is requested via {@link #terms(String)}.
* </p>
* <p>
* NOTE: Only the footer is validated for Completion dictionary (.lkp) and not the checksum due
* to random access pattern and checksum validation being too costly at instantiation
* </p>
*
*/
final class CompletionFieldsProducer extends FieldsProducer {
private FieldsProducer delegateFieldsProducer;
private Map<String, CompletionsTermsReader> readers;
private IndexInput dictIn;
// copy ctr for merge instance
private CompletionFieldsProducer(FieldsProducer delegateFieldsProducer, Map<String, CompletionsTermsReader> readers) {
this.delegateFieldsProducer = delegateFieldsProducer;
this.readers = readers;
}
CompletionFieldsProducer(SegmentReadState state) throws IOException {
String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
delegateFieldsProducer = null;
boolean success = false;
try (ChecksumIndexInput index = state.directory.openChecksumInput(indexFile, state.context)) {
// open up dict file containing all fsts
String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
dictIn = state.directory.openInput(dictFile, state.context);
CodecUtil.checkIndexHeader(dictIn, CODEC_NAME, COMPLETION_CODEC_VERSION, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
// just validate the footer for the dictIn
CodecUtil.retrieveChecksum(dictIn);
// open up index file (fieldNumber, offset)
CodecUtil.checkIndexHeader(index, CODEC_NAME, COMPLETION_CODEC_VERSION, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
// load delegate PF
PostingsFormat delegatePostingsFormat = PostingsFormat.forName(index.readString());
delegateFieldsProducer = delegatePostingsFormat.fieldsProducer(state);
// read suggest field numbers and their offsets in the terms file from index
int numFields = index.readVInt();
readers = new HashMap<>(numFields);
for (int i = 0; i < numFields; i++) {
int fieldNumber = index.readVInt();
long offset = index.readVLong();
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNumber);
// we don't load the FST yet
readers.put(fieldInfo.name, new CompletionsTermsReader(offset));
}
CodecUtil.checkFooter(index);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(delegateFieldsProducer, dictIn);
}
}
}
@Override
public void close() throws IOException {
boolean success = false;
try {
delegateFieldsProducer.close();
IOUtils.close(dictIn);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(delegateFieldsProducer, dictIn);
}
}
}
@Override
public void checkIntegrity() throws IOException {
delegateFieldsProducer.checkIntegrity();
// TODO: checkIntegrity should checksum the dictionary and index
}
@Override
public FieldsProducer getMergeInstance() throws IOException {
return new CompletionFieldsProducer(delegateFieldsProducer, readers);
}
@Override
public long ramBytesUsed() {
long ramBytesUsed = delegateFieldsProducer.ramBytesUsed();
for (CompletionsTermsReader reader : readers.values()) {
ramBytesUsed += reader.ramBytesUsed();
}
return ramBytesUsed;
}
@Override
public Collection<Accountable> getChildResources() {
List<Accountable> accountableList = new ArrayList<>();
for (Map.Entry<String, CompletionsTermsReader> readerEntry : readers.entrySet()) {
accountableList.add(Accountables.namedAccountable(readerEntry.getKey(), readerEntry.getValue()));
}
return Collections.unmodifiableCollection(accountableList);
}
@Override
public Iterator<String> iterator() {
return readers.keySet().iterator();
}
@Override
public Terms terms(String field) throws IOException {
return new CompletionTerms(delegateFieldsProducer.terms(field), readers.get(field));
}
@Override
public int size() {
return readers.size();
}
private class CompletionsTermsReader implements Accountable {
private final long offset;
private NRTSuggester suggester;
public CompletionsTermsReader(long offset) throws IOException {
assert offset >= 0l && offset < dictIn.length();
this.offset = offset;
}
public synchronized NRTSuggester suggester() throws IOException {
if (suggester == null) {
try (IndexInput dictClone = dictIn.clone()) { // let multiple fields load concurrently
dictClone.seek(offset);
suggester = NRTSuggester.load(dictClone);
}
}
return suggester;
}
@Override
public long ramBytesUsed() {
return (suggester != null) ? suggester.ramBytesUsed() : 0;
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.emptyList();
}
}
/**
* Thin wrapper over {@link org.apache.lucene.index.Terms} with
* a {@link NRTSuggester}
*/
public static class CompletionTerms extends FilterLeafReader.FilterTerms {
private final CompletionsTermsReader reader;
public CompletionTerms(Terms in, CompletionsTermsReader reader) {
super(in);
this.reader = reader;
}
/**
* Returns a {@link NRTSuggester} for the field
* or <code>null</code> if no FST
* was indexed for this field
*/
public NRTSuggester suggester() throws IOException {
if (reader == null) {
return null;
}
return reader.suggester();
}
}
}
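For orientation, a sketch of how this producer is consumed at search time, roughly what SuggestIndexSearcher does (the helper class is ours and assumes same-package access, since CompletionFieldsProducer is package-private): the Terms returned for a suggest field can be cast to CompletionTerms to reach the field's NRTSuggester.

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;

class SuggesterLookupSketch {
  // Returns the lazily-loaded NRTSuggester for a suggest field, or null if the
  // field was not indexed with the completion postings format.
  static NRTSuggester suggesterFor(LeafReader reader, String field) throws IOException {
    Terms terms = reader.terms(field);
    if (terms instanceof CompletionFieldsProducer.CompletionTerms) {
      return ((CompletionFieldsProducer.CompletionTerms) terms).suggester();
    }
    return null;
  }
}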

View File

@ -0,0 +1,121 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.FST;
/**
* <p>
* A {@link PostingsFormat} which supports document suggestion based on
* indexed {@link SuggestField}s.
* Document suggestion is based on a weighted FST which maps analyzed
* terms of a {@link SuggestField} to their surface forms and document ids.
* </p>
* <p>
* Files:
* <ul>
* <li><tt>.lkp</tt>: <a href="#Completiondictionary">Completion Dictionary</a></li>
* <li><tt>.cmp</tt>: <a href="#Completionindex">Completion Index</a></li>
* </ul>
* <p>
* <a name="Completionictionary"></a>
* <h3>Completion Dictionary</h3>
* <p>The .lkp file contains an FST for each suggest field
* </p>
* <ul>
* <li>CompletionDict (.lkp) --&gt; Header, FST<sup>NumSuggestFields</sup>, Footer</li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <!-- TODO: should the FST output be mentioned at all? -->
* <li>FST --&gt; {@link FST FST&lt;Long, BytesRef&gt;}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* <p>Notes:</p>
* <ul>
* <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
* for the Completion implementation.</li>
* <li>FST maps all analyzed forms to surface forms of a SuggestField</li>
* </ul>
* <a name="Completionindex"></a>
* <h3>Completion Index</h3>
* <p>The .cmp file contains an index into the completion dictionary, so that it can be
* accessed randomly.</p>
* <ul>
* <li>CompletionIndex (.cmp) --&gt; Header, NumSuggestFields, Entry<sup>NumSuggestFields</sup>, Footer</li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>NumSuggestFields --&gt; {@link DataOutput#writeVInt Uint32}</li>
* <li>Entry --&gt; FieldNumber, CompletionDictionaryOffset</li>
* <li>FieldNumber --&gt; {@link DataOutput#writeVInt Uint32}</li>
* <li>CompletionDictionaryOffset --&gt; {@link DataOutput#writeVLong Uint64}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* <p>Notes:</p>
* <ul>
* <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
* for the Completion implementation.</li>
* <li>NumSuggestFields is the number of suggest fields indexed</li>
* <li>FieldNumber is the fields number from {@link FieldInfos}. (.fnm)</li>
* <li>CompletionDictionaryOffset is the file offset of a field's FST in CompletionDictionary (.lkp)</li>
* </ul>
*
* @lucene.experimental
*/
public abstract class CompletionPostingsFormat extends PostingsFormat {
static final String CODEC_NAME = "completion";
static final int COMPLETION_CODEC_VERSION = 1;
static final int COMPLETION_VERSION_CURRENT = COMPLETION_CODEC_VERSION;
static final String INDEX_EXTENSION = "cmp";
static final String DICT_EXTENSION = "lkp";
/**
* Used only by core Lucene at read-time via Service Provider instantiation
*/
public CompletionPostingsFormat() {
super(CODEC_NAME);
}
/**
* Concrete implementation should specify the delegating postings format
*/
protected abstract PostingsFormat delegatePostingsFormat();
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsFormat delegatePostingsFormat = delegatePostingsFormat();
if (delegatePostingsFormat == null) {
throw new UnsupportedOperationException("Error - " + getClass().getName()
+ " has been constructed without a choice of PostingsFormat");
}
return new CompletionFieldsConsumer(delegatePostingsFormat, state);
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
return new CompletionFieldsProducer(state);
}
}

View File

@ -0,0 +1,358 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Util;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL;
/**
* Token stream which converts a provided token stream to an automaton.
* The strings accepted by the automaton are available through the
* {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute
* The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store
* a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)})
*
*/
final class CompletionTokenStream extends TokenStream {
private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);
private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
private final TokenStream input;
private final boolean preserveSep;
private final boolean preservePositionIncrements;
private final int sepLabel;
private final int maxGraphExpansions;
private BytesRef payload;
private Iterator<IntsRef> finiteStrings;
private int posInc = -1;
private CharTermAttribute charTermAttribute;
/**
* Creates a token stream to convert <code>input</code> to a token stream
* of accepted strings by its automaton.
* <p>
* The token stream <code>input</code> is converted to an automaton
* with the default settings of {@link org.apache.lucene.search.suggest.document.CompletionAnalyzer}
*/
public CompletionTokenStream(TokenStream input) {
this(input, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, SEP_LABEL, DEFAULT_MAX_GRAPH_EXPANSIONS);
}
CompletionTokenStream(TokenStream input, boolean preserveSep, boolean preservePositionIncrements, int sepLabel, int maxGraphExpansions) {
// Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
// the input stream entirely in toFiniteStrings(input)
this.input = input;
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.sepLabel = sepLabel;
this.maxGraphExpansions = maxGraphExpansions;
}
/**
* Returns a separator label that is reserved for the payload
* in {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)}
*/
public int sepLabel() {
return sepLabel;
}
/**
* Sets a payload available throughout successive token stream enumeration
*/
public void setPayload(BytesRef payload) {
this.payload = payload;
}
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
if (finiteStrings == null) {
//TODO: make this return a Iterator<IntsRef> instead?
Automaton automaton = toAutomaton(input);
Set<IntsRef> strings = Operations.getFiniteStrings(automaton, maxGraphExpansions);
posInc = strings.size();
finiteStrings = strings.iterator();
}
if (finiteStrings.hasNext()) {
posAttr.setPositionIncrement(posInc);
/*
* this posInc encodes the number of paths that this surface form
* produced. Multi Fields have the same surface form and therefore sum up
*/
posInc = 0;
Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // now we have UTF-8
if (charTermAttribute != null) {
charTermAttribute.setLength(0);
charTermAttribute.append(bytesAtt.toUTF16());
}
if (payload != null) {
payloadAttr.setPayload(this.payload);
}
return true;
}
return false;
}
@Override
public void end() throws IOException {
super.end();
if (posInc == -1) {
input.end();
}
}
@Override
public void close() throws IOException {
if (posInc == -1) {
input.close();
}
}
@Override
public void reset() throws IOException {
super.reset();
if (hasAttribute(CharTermAttribute.class)) {
// we only fetch this attribute if it already exists, so we can skip the UTF-8 to UTF-16 conversion otherwise
charTermAttribute = getAttribute(CharTermAttribute.class);
}
finiteStrings = null;
posInc = -1;
}
/**
* Converts <code>tokenStream</code> to an automaton
*/
public Automaton toAutomaton(TokenStream tokenStream) throws IOException {
// TODO refactor this
// maybe we could hook up a modified automaton from TermAutomatonQuery here?
Automaton automaton = null;
try {
// Create corresponding automaton: labels are bytes
// from each analyzed token, with byte 0 used as
// separator between tokens:
final TokenStreamToAutomaton tsta;
if (preserveSep) {
tsta = new EscapingTokenStreamToAutomaton((char) SEP_LABEL);
} else {
// When we're not preserving sep, we don't steal 0xff
// byte, so we don't need to do any escaping:
tsta = new TokenStreamToAutomaton();
}
tsta.setPreservePositionIncrements(preservePositionIncrements);
automaton = tsta.toAutomaton(tokenStream);
} finally {
IOUtils.closeWhileHandlingException(tokenStream);
}
// TODO: we can optimize this somewhat by determinizing
// while we convert
automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
// This automaton should not blow up during determinize:
return Operations.determinize(automaton, maxGraphExpansions);
}
/**
* Just escapes the separator label byte (which we reserve for SEP).
*/
private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
final BytesRefBuilder spare = new BytesRefBuilder();
private char sepLabel;
public EscapingTokenStreamToAutomaton(char sepLabel) {
this.sepLabel = sepLabel;
}
@Override
protected BytesRef changeToken(BytesRef in) {
int upto = 0;
for (int i = 0; i < in.length; i++) {
byte b = in.bytes[in.offset + i];
if (b == (byte) sepLabel) {
spare.grow(upto + 2);
spare.setByteAt(upto++, (byte) sepLabel);
spare.setByteAt(upto++, b);
} else {
spare.grow(upto + 1);
spare.setByteAt(upto++, b);
}
}
spare.setLength(upto);
return spare.get();
}
}
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
Automaton result = new Automaton();
// Copy all states over
int numStates = a.getNumStates();
for (int s = 0; s < numStates; s++) {
result.createState();
result.setAccept(s, a.isAccept(s));
}
// Go in reverse topo sort so we know we only have to
// make one pass:
Transition t = new Transition();
int[] topoSortStates = topoSortStates(a);
for (int i = 0; i < topoSortStates.length; i++) {
int state = topoSortStates[topoSortStates.length - 1 - i];
int count = a.initTransition(state, t);
for (int j = 0; j < count; j++) {
a.getNextTransition(t);
if (t.min == TokenStreamToAutomaton.POS_SEP) {
assert t.max == TokenStreamToAutomaton.POS_SEP;
if (preserveSep) {
// Remap to SEP_LABEL:
result.addTransition(state, t.dest, sepLabel);
} else {
result.addEpsilon(state, t.dest);
}
} else if (t.min == TokenStreamToAutomaton.HOLE) {
assert t.max == TokenStreamToAutomaton.HOLE;
// Just remove the hole: there will then be two
// SEP tokens next to each other, which will only
// match another hole at search time. Note that
// it will also match an empty-string token ... if
// that's somehow a problem we can always map HOLE
// to a dedicated byte (and escape it in the
// input).
result.addEpsilon(state, t.dest);
} else {
result.addTransition(state, t.dest, t.min, t.max);
}
}
}
result.finishState();
return result;
}
private static int[] topoSortStates(Automaton a) {
int[] states = new int[a.getNumStates()];
final Set<Integer> visited = new HashSet<>();
final LinkedList<Integer> worklist = new LinkedList<>();
worklist.add(0);
visited.add(0);
int upto = 0;
states[upto] = 0;
upto++;
Transition t = new Transition();
while (worklist.size() > 0) {
int s = worklist.removeFirst();
int count = a.initTransition(s, t);
for (int i = 0; i < count; i++) {
a.getNextTransition(t);
if (!visited.contains(t.dest)) {
visited.add(t.dest);
worklist.add(t.dest);
states[upto++] = t.dest;
}
}
}
return states;
}
public interface ByteTermAttribute extends TermToBytesRefAttribute {
// marker interface
/**
* Return the builder from which the term is derived.
*/
public BytesRefBuilder builder();
public CharSequence toUTF16();
}
public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute {
private final BytesRefBuilder bytes = new BytesRefBuilder();
private CharsRefBuilder charsRef;
@Override
public void fillBytesRef() {
// does nothing - we change in place
}
@Override
public BytesRefBuilder builder() {
return bytes;
}
@Override
public BytesRef getBytesRef() {
return bytes.get();
}
@Override
public void clear() {
bytes.clear();
}
@Override
public void copyTo(AttributeImpl target) {
ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
other.bytes.copyBytes(bytes);
}
@Override
public CharSequence toUTF16() {
if (charsRef == null) {
charsRef = new CharsRefBuilder();
}
charsRef.copyUTF8Bytes(getBytesRef());
return charsRef.get();
}
}
}
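A consumption sketch (illustrative only: the class is package-private, so this would live in the same package, and the field name and input text are made up): each call to incrementToken() emits one accepted string of the automaton through the TermToBytesRefAttribute.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;

class CompletionTokenStreamSketch {
  static void printAnalyzedForms() throws java.io.IOException {
    Analyzer analyzer = new StandardAnalyzer();
    try (TokenStream source = analyzer.tokenStream("suggest", "nirvana nevermind");
         CompletionTokenStream stream = new CompletionTokenStream(source)) {
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
        // tokens within one analyzed form are joined by the reserved SEP_LABEL byte
        System.out.println(termAtt.getBytesRef().utf8ToString());
      }
      stream.end();
    }
  }
}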

View File

@ -0,0 +1,324 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.suggest.analyzing.FSTUtil;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseDocID;
import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseSurfaceForm;
/**
* <p>
* NRTSuggester returns Top N completions with corresponding documents matching a provided automaton.
* The completions are returned in descending order of their corresponding weight.
* Deleted documents are filtered out in near real time using the provided reader.
* A {@link org.apache.lucene.search.DocIdSet} can be passed in at query time to filter out documents.
* </p>
* <p>
* See {@link #lookup(LeafReader, Automaton, int, DocIdSet, TopSuggestDocsCollector)} for more implementation
* details.
* <p>
* Builder: {@link NRTSuggesterBuilder}
* </p>
* <p>
* FST Format:
* <ul>
* <li>Input: analyzed forms of input terms</li>
* <li>Output: Pair&lt;Long, BytesRef&gt; containing weight, surface form and docID</li>
* </ul>
* <p>
* NOTE:
* <ul>
* <li>currently only {@link org.apache.lucene.search.DocIdSet}s with random access capabilities are supported.</li>
* <li>having too many deletions or using a very restrictive filter can make the search inadmissible due to
* over-pruning of potential paths</li>
* <li>when a {@link org.apache.lucene.search.DocIdSet} is used, it is assumed that the filter will roughly
* filter out half the number of documents that match the provided automaton</li>
* <li>lookup performance will degrade as more accepted completions lead to filtered out documents</li>
* </ul>
*
*/
final class NRTSuggester implements Accountable {
/**
* FST<Weight,Surface>:
* input is the analyzed form, with a separator label between terms
* and a {@link NRTSuggesterBuilder#END_BYTE} to denote the
* end of the input
* weight is a long
* surface is the original, unanalyzed form followed by the docID
*/
private final FST<Pair<Long, BytesRef>> fst;
/**
* Highest number of analyzed paths we saw for any single
* input surface form. This can be > 1, when the index analyzer
* creates graphs or if multiple surface forms yield the
* same analyzed form
*/
private final int maxAnalyzedPathsPerOutput;
/**
* Separator used between surface form and its docID in the FST output
*/
private final int payloadSep;
/**
* Label used to denote the end of an input in the FST and
* the beginning of dedup bytes
*/
private final int endByte;
/**
* Maximum queue depth for TopNSearcher
*
* NOTE: value should be <= Integer.MAX_VALUE
*/
private static final long MAX_TOP_N_QUEUE_SIZE = 1000;
private NRTSuggester(FST<Pair<Long, BytesRef>> fst, int maxAnalyzedPathsPerOutput, int payloadSep, int endByte) {
this.fst = fst;
this.maxAnalyzedPathsPerOutput = maxAnalyzedPathsPerOutput;
this.payloadSep = payloadSep;
this.endByte = endByte;
}
@Override
public long ramBytesUsed() {
return fst == null ? 0 : fst.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.emptyList();
}
private static Comparator<Pair<Long, BytesRef>> getComparator() {
return new Comparator<Pair<Long, BytesRef>>() {
@Override
public int compare(Pair<Long, BytesRef> o1, Pair<Long, BytesRef> o2) {
return Long.compare(o1.output1, o2.output1);
}
};
}
/**
* Collects at most the top <code>num</code> completions that have a prefix accepted by
* <code>automaton</code>, filtered by <code>filter</code> on the corresponding documents
* <p>
* Supports near real time deleted document filtering using <code>reader</code>
* <p>
* {@link TopSuggestDocsCollector#collect(int, CharSequence, long)} is called
* for every matched completion
* <p>
* Completion collection can be early terminated by throwing {@link org.apache.lucene.search.CollectionTerminatedException}
*/
public void lookup(final LeafReader reader, final Automaton automaton, final int num, final DocIdSet filter, final TopSuggestDocsCollector collector) {
final Bits filterDocs;
try {
if (filter != null) {
if (filter.iterator() == null) {
return;
}
if (filter.bits() == null) {
throw new IllegalArgumentException("DocIDSet does not provide random access interface");
} else {
filterDocs = filter.bits();
}
} else {
filterDocs = null;
}
} catch (IOException e) {
throw new RuntimeException(e);
}
int queueSize = getMaxTopNSearcherQueueSize(num, reader, filterDocs != null);
if (queueSize == -1) {
return;
}
final Bits liveDocs = reader.getLiveDocs();
try {
final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num, queueSize, getComparator()) {
private final CharsRefBuilder spare = new CharsRefBuilder();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
int payloadSepIndex = parseSurfaceForm(output.output2, payloadSep, spare);
int docID = parseDocID(output.output2, payloadSepIndex);
// filter out deleted docs only if no filter is set
if (filterDocs == null && liveDocs != null && !liveDocs.get(docID)) {
return false;
}
// filter by filter context
if (filterDocs != null && !filterDocs.get(docID)) {
return false;
}
try {
collector.collect(docID, spare.toCharsRef(), decode(output.output1));
return true;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
};
// TODO: add fuzzy support
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, false, path.input);
}
try {
// hits are also returned by search()
// we do not use it, instead collect at acceptResult
Util.TopResults<Pair<Long, BytesRef>> search = searcher.search();
// search admissibility is not guaranteed
// see comment on getMaxTopNSearcherQueueSize
// assert search.isComplete;
} catch (CollectionTerminatedException e) {
// terminate
}
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
/**
* Simple heuristics to try to avoid over-pruning potential suggestions by the
* TopNSearcher. Since suggestion entries can be rejected if they belong
* to a deleted document, the length of the TopNSearcher queue has to
* be increased by some factor, to account for the filtered out suggestions.
* This heuristic will try to make the searcher admissible, but the search
* can still lead to over-pruning
* <p>
* If a <code>filter</code> is applied, the queue size is increased by
* half the number of live documents.
* <p>
* The maximum queue size is {@link #MAX_TOP_N_QUEUE_SIZE}
*/
private int getMaxTopNSearcherQueueSize(int num, LeafReader reader, boolean filterEnabled) {
double liveDocsRatio = calculateLiveDocRatio(reader.numDocs(), reader.maxDoc());
if (liveDocsRatio == -1) {
return -1;
}
long maxQueueSize = num * maxAnalyzedPathsPerOutput;
// liveDocRatio can be at most 1.0 (if no docs were deleted)
assert liveDocsRatio <= 1.0d;
maxQueueSize = (long) (maxQueueSize / liveDocsRatio);
if (filterEnabled) {
maxQueueSize = maxQueueSize + (reader.numDocs()/2);
}
return (int) Math.min(MAX_TOP_N_QUEUE_SIZE, maxQueueSize);
}
private static double calculateLiveDocRatio(int numDocs, int maxDocs) {
return (numDocs > 0) ? ((double) numDocs / maxDocs) : -1;
}
/**
* Loads a {@link NRTSuggester} from {@link org.apache.lucene.store.IndexInput}
*/
public static NRTSuggester load(IndexInput input) throws IOException {
final FST<Pair<Long, BytesRef>> fst = new FST<>(input, new PairOutputs<>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
/* read some meta info */
int maxAnalyzedPathsPerOutput = input.readVInt();
int endByte = input.readVInt();
int payloadSep = input.readVInt();
return new NRTSuggester(fst, maxAnalyzedPathsPerOutput, payloadSep, endByte);
}
static long encode(long input) {
if (input < 0) {
throw new UnsupportedOperationException("cannot encode value: " + input);
}
return Long.MAX_VALUE - input;
}
static long decode(long output) {
return (Long.MAX_VALUE - output);
}
/**
* Helper to encode/decode payload (surface + PAYLOAD_SEP + docID) output
*/
static final class PayLoadProcessor {
final static private int MAX_DOC_ID_LEN_WITH_SEP = 6; // vint takes at most 5 bytes
static int parseSurfaceForm(final BytesRef output, int payloadSep, CharsRefBuilder spare) {
int surfaceFormLen = -1;
for (int i = 0; i < output.length; i++) {
if (output.bytes[output.offset + i] == payloadSep) {
surfaceFormLen = i;
break;
}
}
assert surfaceFormLen != -1 : "no payloadSep found, unable to determine surface form";
spare.copyUTF8Bytes(output.bytes, output.offset, surfaceFormLen);
return surfaceFormLen;
}
static int parseDocID(final BytesRef output, int payloadSepIndex) {
assert payloadSepIndex != -1 : "payload sep index can not be -1";
ByteArrayDataInput input = new ByteArrayDataInput(output.bytes, payloadSepIndex + output.offset + 1, output.length - (payloadSepIndex + output.offset));
return input.readVInt();
}
static BytesRef make(final BytesRef surface, int docID, int payloadSep) throws IOException {
int len = surface.length + MAX_DOC_ID_LEN_WITH_SEP;
byte[] buffer = new byte[len];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
output.writeBytes(surface.bytes, surface.length - surface.offset);
output.writeByte((byte) payloadSep);
output.writeVInt(docID);
return new BytesRef(buffer, 0, output.getPosition());
}
}
}
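A worked example of the weight encoding above (not part of the commit; assumes same-package access to the static helpers): weights are stored as Long.MAX_VALUE - weight, so the TopNSearcher, which surfaces minimal FST outputs first, returns the highest-weighted completions first.

class WeightEncodingSketch {
  static void demo() {
    long stored = NRTSuggester.encode(42L);    // Long.MAX_VALUE - 42
    long weight = NRTSuggester.decode(stored); // round-trips back to 42
    assert weight == 42L;
    // a higher weight yields a smaller stored value, so it sorts first during the FST search
    assert NRTSuggester.encode(100L) < NRTSuggester.encode(1L);
  }
}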

View File

@ -0,0 +1,165 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.PriorityQueue;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import static org.apache.lucene.search.suggest.document.NRTSuggester.encode;
/**
* Builder for {@link NRTSuggester}
*
*/
final class NRTSuggesterBuilder {
/**
* Label used to separate surface form and docID
* in the output
*/
public static final int PAYLOAD_SEP = '\u001F';
/**
* Marks end of the analyzed input and start of dedup
* byte.
*/
private static final int END_BYTE = 0x0;
private final PairOutputs<Long, BytesRef> outputs;
private final Builder<PairOutputs.Pair<Long, BytesRef>> builder;
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
private final BytesRefBuilder analyzed = new BytesRefBuilder();
private final PriorityQueue<Entry> entries;
private final int payloadSep;
private final int endByte;
private int maxAnalyzedPathsPerOutput = 0;
/**
* Create a builder for {@link NRTSuggester}
*/
public NRTSuggesterBuilder() {
this.payloadSep = PAYLOAD_SEP;
this.endByte = END_BYTE;
this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.entries = new PriorityQueue<>();
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
}
/**
* Initializes an FST input term to add entries against
*/
public void startTerm(BytesRef analyzed) {
this.analyzed.copyBytes(analyzed);
this.analyzed.append((byte) endByte);
}
/**
* Adds an entry for the latest input term, should be called after
* {@link #startTerm(org.apache.lucene.util.BytesRef)} on the desired input
*/
public void addEntry(int docID, BytesRef surfaceForm, long weight) throws IOException {
BytesRef payloadRef = NRTSuggester.PayLoadProcessor.make(surfaceForm, docID, payloadSep);
entries.add(new Entry(payloadRef, encode(weight)));
}
/**
* Writes all the entries for the FST input term
*/
public void finishTerm() throws IOException {
int numArcs = 0;
int numDedupBytes = 1;
analyzed.grow(analyzed.length() + 1);
analyzed.setLength(analyzed.length() + 1);
for (Entry entry : entries) {
if (numArcs == maxNumArcsForDedupByte(numDedupBytes)) {
analyzed.setByteAt(analyzed.length() - 1, (byte) (numArcs));
analyzed.grow(analyzed.length() + 1);
analyzed.setLength(analyzed.length() + 1);
numArcs = 0;
numDedupBytes++;
}
analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
Util.toIntsRef(analyzed.get(), scratchInts);
builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
}
maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
entries.clear();
}
/**
* Builds and stores a FST that can be loaded with
* {@link NRTSuggester#load(org.apache.lucene.store.IndexInput)}
*/
public boolean store(DataOutput output) throws IOException {
final FST<PairOutputs.Pair<Long, BytesRef>> build = builder.finish();
if (build == null) {
return false;
}
build.save(output);
/* write some more meta-info */
assert maxAnalyzedPathsPerOutput > 0;
output.writeVInt(maxAnalyzedPathsPerOutput);
output.writeVInt(END_BYTE);
output.writeVInt(PAYLOAD_SEP);
return true;
}
/**
* Num arcs for nth dedup byte:
* if n <= 5: 1 + (2 * n)
* else: (1 + (2 * n)) * n
* <p>
* TODO: is there a better way to make the FST we build
* more TopNSearcher friendly?
*/
private static int maxNumArcsForDedupByte(int currentNumDedupBytes) {
int maxArcs = 1 + (2 * currentNumDedupBytes);
if (currentNumDedupBytes > 5) {
maxArcs *= currentNumDedupBytes;
}
return Math.min(maxArcs, 255);
}
private final static class Entry implements Comparable<Entry> {
final BytesRef payload;
final long weight;
public Entry(BytesRef payload, long weight) {
this.payload = payload;
this.weight = weight;
}
@Override
public int compareTo(Entry o) {
return Long.compare(weight, o.weight);
}
}
}
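A build-and-store flow sketch (the analyzed form, surface form, doc id, weight, directory and file name are all invented for illustration; assumes same-package access): this mirrors how CompletionTermWriter drives the builder, one startTerm/addEntry/finishTerm cycle per analyzed term, followed by a single store.

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;

class NRTSuggesterBuilderSketch {
  static void buildAndStore() throws java.io.IOException {
    NRTSuggesterBuilder builder = new NRTSuggesterBuilder();
    builder.startTerm(new BytesRef("star wars"));        // analyzed form
    builder.addEntry(7, new BytesRef("Star Wars"), 42L); // docID, surface form, weight
    builder.finishTerm();
    try (Directory dir = new RAMDirectory();
         IndexOutput out = dir.createOutput("example.lkp", IOContext.DEFAULT)) {
      boolean stored = builder.store(out);               // false if no entries were added
      assert stored;
    }
  }
}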

View File

@ -0,0 +1,123 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.BytesRef;
/**
* <p>
* Field that indexes a string value and a weight as a weighted completion
* against a named suggester.
* The field is tokenized, not stored, and indexes documents, frequencies and positions.
* Field can be used to provide near real time document suggestions.
* </p>
* <p>
* Besides the usual {@link org.apache.lucene.analysis.Analyzer}s,
* {@link CompletionAnalyzer}
* can be used to tune suggest field only parameters
* (e.g. preserving token separators, preserving position increments
* when converting the token stream to an automaton)
* </p>
* <p>
* Example indexing usage:
* <pre class="prettyprint">
* document.add(new SuggestField(name, "suggestion", 4));
* </pre>
* To perform document suggestions based on this field, use
* {@link SuggestIndexSearcher#suggest(String, CharSequence, int, org.apache.lucene.search.Filter)}
* <p>
* Example query usage:
* <pre class="prettyprint">
* SuggestIndexSearcher indexSearcher = ..
* indexSearcher.suggest(name, "su", 2)
* </pre>
*
* @lucene.experimental
*/
public class SuggestField extends Field {
private static final FieldType FIELD_TYPE = new FieldType();
static {
FIELD_TYPE.setTokenized(true);
FIELD_TYPE.setStored(false);
FIELD_TYPE.setStoreTermVectors(false);
FIELD_TYPE.setOmitNorms(false);
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
FIELD_TYPE.freeze();
}
private final BytesRef surfaceForm;
private final long weight;
/**
* Creates a {@link SuggestField}
*
* @param name of the field
* @param value to get suggestions on
* @param weight weight of the suggestion
*/
public SuggestField(String name, String value, long weight) {
super(name, value, FIELD_TYPE);
if (weight < 0L) {
throw new IllegalArgumentException("weight must be >= 0");
}
this.surfaceForm = new BytesRef(value);
this.weight = weight;
}
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
TokenStream stream = super.tokenStream(analyzer, reuse);
CompletionTokenStream completionStream;
if (stream instanceof CompletionTokenStream) {
completionStream = (CompletionTokenStream) stream;
} else {
completionStream = new CompletionTokenStream(stream);
}
BytesRef suggestPayload = buildSuggestPayload(surfaceForm, weight, (char) completionStream.sepLabel());
completionStream.setPayload(suggestPayload);
return completionStream;
}
private BytesRef buildSuggestPayload(BytesRef surfaceForm, long weight, char sepLabel) throws IOException {
for (int i = 0; i < surfaceForm.length; i++) {
if (surfaceForm.bytes[surfaceForm.offset + i] == sepLabel) {
assert sepLabel == '\u001f';
throw new IllegalArgumentException(
"surface form cannot contain unit separator character U+001F; this character is reserved");
}
}
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
output.writeVInt(surfaceForm.length);
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
output.writeVLong(weight + 1);
}
return new BytesRef(byteArrayOutputStream.toByteArray());
}
}
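For reference, a minimal end-to-end sketch assembled from this patch's API (SuggestField, Completion50PostingsFormat, SuggestIndexSearcher) together with stock Lucene classes; the wrapper class, the "suggest_title" field name, the sample data, StandardAnalyzer and RAMDirectory are illustrative assumptions, not part of the patch:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.suggest.document.Completion50PostingsFormat;
import org.apache.lucene.search.suggest.document.SuggestField;
import org.apache.lucene.search.suggest.document.SuggestIndexSearcher;
import org.apache.lucene.search.suggest.document.TopSuggestDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class SuggestFieldExample {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    // route the suggest field to the completion postings format added by this change
    iwc.setCodec(new Lucene50Codec() {
      final PostingsFormat completions = new Completion50PostingsFormat();
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return "suggest_title".equals(field) ? completions : super.getPostingsFormatForField(field);
      }
    });
    Directory dir = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
      Document doc = new Document();
      doc.add(new SuggestField("suggest_title", "star wars", 10));
      writer.addDocument(doc);
      writer.commit();
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader, analyzer);
        TopSuggestDocs hits = searcher.suggest("suggest_title", "sta", 5);
        for (TopSuggestDocs.SuggestScoreDoc hit : hits.scoreLookupDocs()) {
          System.out.println(hit.key + " (doc=" + hit.doc + ", weight=" + (long) hit.score + ")");
        }
      }
    }
  }
}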

View File

@ -0,0 +1,150 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.automaton.Automaton;
import static org.apache.lucene.search.suggest.document.CompletionFieldsProducer.CompletionTerms;
/**
* Adds document suggest capabilities to IndexSearcher
*
* @lucene.experimental
*/
public class SuggestIndexSearcher extends IndexSearcher {
private final Analyzer queryAnalyzer;
/**
* Creates a searcher with document suggest capabilities
* for <code>reader</code>.
* <p>
* Suggestion <code>key</code> is analyzed with <code>queryAnalyzer</code>
*/
public SuggestIndexSearcher(IndexReader reader, Analyzer queryAnalyzer) {
super(reader);
this.queryAnalyzer = queryAnalyzer;
}
/**
* Calls {@link #suggest(String, CharSequence, int, Filter)}
* with no document filter
*/
public TopSuggestDocs suggest(String field, CharSequence key, int num) throws IOException {
return suggest(field, key, num, (Filter) null);
}
/**
* Calls {@link #suggest(String, CharSequence, int, Filter, TopSuggestDocsCollector)}
* with no document filter
*/
public void suggest(String field, CharSequence key, int num, TopSuggestDocsCollector collector) throws IOException {
suggest(field, key, num, null, collector);
}
/**
 * Suggests at most <code>num</code> documents, filtered by <code>filter</code>,
 * that complete to <code>key</code> for a suggest <code>field</code>
 * <p>
 * Returns at most the top <code>num</code> document ids with their corresponding completion and weight pairs
 *
 * @throws java.lang.IllegalArgumentException if <code>filter</code> does not provide a random access
 * interface or if <code>field</code> is not a {@link SuggestField}
 */
public TopSuggestDocs suggest(String field, CharSequence key, int num, Filter filter) throws IOException {
TopSuggestDocsCollector collector = new TopSuggestDocsCollector(num);
suggest(field, key, num, filter, collector);
return collector.get();
}
/**
 * Suggests at most <code>num</code> documents, filtered by <code>filter</code>,
 * that complete to <code>key</code> for a suggest <code>field</code>
 * <p>
 * Completions are collected with {@link TopSuggestDocsCollector},
 * in order of the suggest <code>field</code> weight.
 * The same document can be collected more than once if the <code>key</code>
 * matches multiple <code>field</code> values of that document
 *
 * @throws java.lang.IllegalArgumentException if <code>filter</code> does not provide a random access
 * interface or if <code>field</code> is not a {@link SuggestField}
 */
public void suggest(String field, CharSequence key, int num, Filter filter, TopSuggestDocsCollector collector) throws IOException {
// verify input
if (field == null) {
throw new IllegalArgumentException("'field' can not be null");
}
if (num <= 0) {
throw new IllegalArgumentException("'num' should be > 0");
}
if (collector == null) {
throw new IllegalArgumentException("'collector' can not be null");
}
// build query automaton
CompletionAnalyzer analyzer;
if (queryAnalyzer instanceof CompletionAnalyzer) {
analyzer = (CompletionAnalyzer) queryAnalyzer;
} else {
analyzer = new CompletionAnalyzer(queryAnalyzer);
}
final Automaton automaton = analyzer.toAutomaton(field, key);
// collect results
for (LeafReaderContext context : getIndexReader().leaves()) {
TopSuggestDocsCollector leafCollector = (TopSuggestDocsCollector) collector.getLeafCollector(context);
LeafReader reader = context.reader();
Terms terms = reader.terms(field);
if (terms == null) {
continue;
}
NRTSuggester suggester;
if (terms instanceof CompletionTerms) {
CompletionTerms completionTerms = (CompletionTerms) terms;
suggester = completionTerms.suggester();
} else {
throw new IllegalArgumentException(field + " is not a SuggestField");
}
if (suggester == null) {
// a segment can have a null suggester
// i.e. no FST was built
continue;
}
DocIdSet docIdSet = null;
if (filter != null) {
docIdSet = filter.getDocIdSet(context, reader.getLiveDocs());
if (docIdSet == null) {
// filter matches no docs in current leaf
continue;
}
}
suggester.lookup(reader, automaton, num, docIdSet, leafCollector);
}
}
}
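For the filtered variant, a brief sketch continuing the illustrative example after SuggestField above; the "price" field and bounds are made up, and the filter has to produce a DocIdSet exposing random-access bits(), which is what the randomAccessFilter helper in SuggestFieldTest further below arranges:

// restrict suggestions to documents whose "price" lies in [0, 100];
// the filter is adapted so its DocIdSet exposes non-null bits(), as suggest() requires
Filter priceFilter = randomAccessFilter(
    new QueryWrapperFilter(NumericRangeQuery.newIntRange("price", 0, 100, true, true)));
TopSuggestDocs hits = searcher.suggest("suggest_title", "sta", 5, priceFilter);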

View File

@ -0,0 +1,56 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;
import org.apache.lucene.util.PriorityQueue;
/**
* Bounded priority queue for {@link SuggestScoreDoc}s.
* Priority is based on {@link SuggestScoreDoc#score}; ties
* are broken by preferring the smaller {@link SuggestScoreDoc#doc}
*/
final class SuggestScoreDocPriorityQueue extends PriorityQueue<SuggestScoreDoc> {
/**
* Creates a new priority queue of the specified size.
*/
public SuggestScoreDocPriorityQueue(int size) {
super(size);
}
@Override
protected boolean lessThan(SuggestScoreDoc a, SuggestScoreDoc b) {
if (a.score == b.score) {
// prefer smaller doc id, in case of a tie
return a.doc > b.doc;
}
return a.score < b.score;
}
/**
* Returns the top N results in descending order.
*/
public SuggestScoreDoc[] getResults() {
int size = size();
SuggestScoreDoc[] res = new SuggestScoreDoc[size];
for (int i = size - 1; i >= 0; i--) {
res[i] = pop();
}
return res;
}
}
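A minimal sketch of the queue's contract (written as if in the same package, since the class is package-private; keys and weights are made up): a bounded queue of size 2 keeps the two highest-weighted entries and getResults() hands them back highest first.

SuggestScoreDocPriorityQueue pq = new SuggestScoreDocPriorityQueue(2);
pq.insertWithOverflow(new SuggestScoreDoc(1, "foo", 3));
pq.insertWithOverflow(new SuggestScoreDoc(2, "bar", 5));
pq.insertWithOverflow(new SuggestScoreDoc(3, "baz", 4)); // evicts ("foo", 3), the lowest score
SuggestScoreDoc[] top = pq.getResults();                 // ("bar", 5), ("baz", 4), descending by score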

View File

@ -0,0 +1,111 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.suggest.Lookup;
/**
* {@link org.apache.lucene.search.TopDocs} wrapper with
* an additional CharSequence key per {@link org.apache.lucene.search.ScoreDoc}
*
* @lucene.experimental
*/
public class TopSuggestDocs extends TopDocs {
/**
* Singleton for empty {@link TopSuggestDocs}
*/
public final static TopSuggestDocs EMPTY = new TopSuggestDocs(0, new SuggestScoreDoc[0], 0);
/**
* {@link org.apache.lucene.search.ScoreDoc} with an
* additional CharSequence key
*/
public static class SuggestScoreDoc extends ScoreDoc implements Comparable<SuggestScoreDoc> {
/**
* Matched completion key
*/
public CharSequence key;
/**
* Creates a SuggestScoreDoc instance
*
* @param doc document id (hit)
* @param key matched completion
* @param score weight of the matched completion
*/
public SuggestScoreDoc(int doc, CharSequence key, long score) {
// loss of precision but not magnitude
// implicit conversion from long -> float
super(doc, score);
this.key = key;
}
@Override
public int compareTo(SuggestScoreDoc o) {
return Lookup.CHARSEQUENCE_COMPARATOR.compare(key, o.key);
}
}
/**
* {@link org.apache.lucene.search.TopDocs} wrapper with
* {@link TopSuggestDocs.SuggestScoreDoc}
* instead of {@link org.apache.lucene.search.ScoreDoc}
*/
public TopSuggestDocs(int totalHits, SuggestScoreDoc[] scoreDocs, float maxScore) {
super(totalHits, scoreDocs, maxScore);
}
/**
* Returns {@link TopSuggestDocs.SuggestScoreDoc}s
* for this instance
*/
public SuggestScoreDoc[] scoreLookupDocs() {
return (SuggestScoreDoc[]) scoreDocs;
}
/**
* Returns a new TopSuggestDocs, containing the topN results across
* the provided TopSuggestDocs, sorted by score. Each {@link TopSuggestDocs}
* instance must be sorted.
* Analogous to {@link org.apache.lucene.search.TopDocs#merge(int, org.apache.lucene.search.TopDocs[])}
* for {@link TopSuggestDocs}
*
* NOTE: assumes every <code>shardHit</code> is already sorted by score
*/
public static TopSuggestDocs merge(int topN, TopSuggestDocs[] shardHits) {
SuggestScoreDocPriorityQueue priorityQueue = new SuggestScoreDocPriorityQueue(topN);
for (TopSuggestDocs shardHit : shardHits) {
for (SuggestScoreDoc scoreDoc : shardHit.scoreLookupDocs()) {
if (scoreDoc == priorityQueue.insertWithOverflow(scoreDoc)) {
break;
}
}
}
SuggestScoreDoc[] topNResults = priorityQueue.getResults();
if (topNResults.length > 0) {
return new TopSuggestDocs(topNResults.length, topNResults, topNResults[0].score);
} else {
return TopSuggestDocs.EMPTY;
}
}
}
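A sketch of combining per-shard results with merge, analogous to TopDocs#merge; searcher0 and searcher1 are assumed to be SuggestIndexSearchers over separate shard readers, and the field and prefix reuse the illustrative names from the earlier sketch:

TopSuggestDocs shard0 = searcher0.suggest("suggest_title", "sta", 5);
TopSuggestDocs shard1 = searcher1.suggest("suggest_title", "sta", 5);
// each shard result is already sorted by score, as merge() requires
TopSuggestDocs merged = TopSuggestDocs.merge(5, new TopSuggestDocs[] {shard0, shard1});
for (TopSuggestDocs.SuggestScoreDoc hit : merged.scoreLookupDocs()) {
  System.out.println(hit.key + " -> " + (long) hit.score);
}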

View File

@ -0,0 +1,118 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.SimpleCollector;
import static org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;
/**
 * {@link org.apache.lucene.search.Collector} that collects completion and
 * score, along with document id
 * <p>
 * Non-scoring collector that collects completions in order of their
 * pre-defined weight.
 * <p>
 * NOTE: A document can be collected multiple times if it
 * is matched for multiple unique completions of a given query
 * <p>
 * Subclasses should only override {@link TopSuggestDocsCollector#collect(int, CharSequence, long)};
 * {@link #setScorer(org.apache.lucene.search.Scorer)} is not
 * used
 *
 * @lucene.experimental
 */
public class TopSuggestDocsCollector extends SimpleCollector {
private final SuggestScoreDocPriorityQueue priorityQueue;
/**
* Document base offset for the current leaf
*/
protected int docBase;
/**
* Sole constructor
*
* Collects at most <code>num</code> completions
* with corresponding document and weight
*/
public TopSuggestDocsCollector(int num) {
if (num <= 0) {
throw new IllegalArgumentException("'num' must be > 0");
}
this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
}
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
docBase = context.docBase;
}
/**
* Called for every matched completion,
* similar to {@link org.apache.lucene.search.LeafCollector#collect(int)}
* but for completions.
*
* NOTE: collection at the leaf level is guaranteed to be in
* descending order of score
*/
public void collect(int docID, CharSequence key, long score) throws IOException {
SuggestScoreDoc current = new SuggestScoreDoc(docBase + docID, key, score);
if (current == priorityQueue.insertWithOverflow(current)) {
// if the current SuggestScoreDoc has overflown from pq,
// we can assume all of the successive collections from
// this leaf will be overflown as well
// TODO: reuse the overflow instance?
throw new CollectionTerminatedException();
}
}
/**
* Returns a {@link org.apache.lucene.search.suggest.document.TopSuggestDocs} with at most <code>num</code> top scoring suggestions
*/
public TopSuggestDocs get() throws IOException {
SuggestScoreDoc[] suggestScoreDocs = priorityQueue.getResults();
if (suggestScoreDocs.length > 0) {
return new TopSuggestDocs(suggestScoreDocs.length, suggestScoreDocs, suggestScoreDocs[0].score);
} else {
return TopSuggestDocs.EMPTY;
}
}
/**
* Ignored
*/
@Override
public void collect(int doc) throws IOException {
// {@link #collect(int, CharSequence, long)} is used
// instead
}
/**
* Ignored
*/
@Override
public boolean needsScores() {
return false;
}
}
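A sketch of the subclassing pattern described in the class Javadoc: override collect(int, CharSequence, long) to observe each matched completion while still delegating to super so get() returns the top-N; the searcher, field and prefix are carried over from the earlier illustrative example:

TopSuggestDocsCollector collector = new TopSuggestDocsCollector(5) {
  @Override
  public void collect(int docID, CharSequence key, long score) throws IOException {
    // docID is leaf-local here; docBase + docID is the global doc id
    System.out.println("doc " + (docBase + docID) + " -> " + key + " (weight " + score + ")");
    super.collect(docID, key, score); // keep feeding the priority queue so get() still works
  }
};
searcher.suggest("suggest_title", "sta", 5, collector);
TopSuggestDocs top = collector.get();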

View File

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Support for document suggestion
*/
package org.apache.lucene.search.suggest.document;

View File

@ -0,0 +1,33 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
org.apache.lucene.search.suggest.document.Completion50PostingsFormat

View File

@ -0,0 +1,786 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CyclicBarrier;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import static org.apache.lucene.search.suggest.document.TopSuggestDocs.*;
import static org.hamcrest.core.IsEqual.equalTo;
public class SuggestFieldTest extends LuceneTestCase {
public Directory dir;
@Before
public void before() throws Exception {
dir = newDirectory();
}
@After
public void after() throws Exception {
dir.close();
}
@Test
public void testSimple() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
Document document = new Document();
document.add(newSuggestField("suggest_field", "abc", 3L));
document.add(newSuggestField("suggest_field", "abd", 4L));
document.add(newSuggestField("suggest_field", "The Foo Fighters", 2L));
iw.addDocument(document);
document.clear();
document.add(newSuggestField("suggest_field", "abcdd", 5));
iw.addDocument(document);
if (rarely()) {
iw.commit();
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs lookupDocs = suggestIndexSearcher.suggest("suggest_field", "ab", 3);
assertSuggestions(lookupDocs, new Entry("abcdd", 5), new Entry("abd", 4), new Entry("abc", 3));
reader.close();
iw.close();
}
@Test
public void testMultipleSuggestFieldsPerDoc() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "sug_field_1", "sug_field_2"));
Document document = new Document();
document.add(newSuggestField("sug_field_1", "apple", 4));
document.add(newSuggestField("sug_field_2", "april", 3));
iw.addDocument(document);
document.clear();
document.add(newSuggestField("sug_field_1", "aples", 3));
document.add(newSuggestField("sug_field_2", "apartment", 2));
iw.addDocument(document);
if (rarely()) {
iw.commit();
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggestDocs1 = suggestIndexSearcher.suggest("sug_field_1", "ap", 4);
assertSuggestions(suggestDocs1, new Entry("apple", 4), new Entry("aples", 3));
TopSuggestDocs suggestDocs2 = suggestIndexSearcher.suggest("sug_field_2", "ap", 4);
assertSuggestions(suggestDocs2, new Entry("april", 3), new Entry("apartment", 2));
// check that the doc ids are consistent
for (int i = 0; i < suggestDocs1.scoreDocs.length; i++) {
ScoreDoc suggestScoreDoc = suggestDocs1.scoreDocs[i];
assertThat(suggestScoreDoc.doc, equalTo(suggestDocs2.scoreDocs[i].doc));
}
reader.close();
iw.close();
}
@Test
public void testDupSuggestFieldValues() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(300);
long[] weights = new long[num];
for(int i = 0; i < num; i++) {
Document document = new Document();
weights[i] = Math.abs(random().nextLong());
document.add(newSuggestField("suggest_field", "abc", weights[i]));
iw.addDocument(document);
}
if (rarely()) {
iw.commit();
}
DirectoryReader reader = iw.getReader();
Entry[] expectedEntries = new Entry[num];
Arrays.sort(weights);
for (int i = 1; i <= num; i++) {
expectedEntries[i - 1] = new Entry("abc", weights[num - i]);
}
SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs lookupDocs = suggestIndexSearcher.suggest("suggest_field", "abc", num);
assertSuggestions(lookupDocs, expectedEntries);
reader.close();
iw.close();
}
@Test
public void testNRTDeletedDocFiltering() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
// using IndexWriter instead of RandomIndexWriter
IndexWriter iw = new IndexWriter(dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
int numLive = 0;
List<Entry> expectedEntries = new ArrayList<>();
for (int i = 0; i < num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, num - i));
if (i % 2 == 0) {
document.add(newStringField("str_field", "delete", Field.Store.YES));
} else {
numLive++;
expectedEntries.add(new Entry("abc_" + i, num - i));
document.add(newStringField("str_field", "no_delete", Field.Store.YES));
}
iw.addDocument(document);
document.clear();
}
// get docIDs to delete
DirectoryReader reader = DirectoryReader.open(iw, false);
List<Integer> docIdsToDelete = new ArrayList<>();
for (int i = 0; i < reader.maxDoc(); i++) {
StoredDocument doc = reader.document(i);
if ("delete".equals(doc.get("str_field"))) {
docIdsToDelete.add(i);
}
}
for (Integer docID : docIdsToDelete) {
assertTrue(iw.tryDeleteDocument(reader, docID));
}
reader.close();
reader = DirectoryReader.open(iw, false);
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", numLive);
assertSuggestions(suggest, expectedEntries.toArray(new Entry[expectedEntries.size()]));
reader.close();
iw.close();
}
@Test
public void testSuggestOnAllFilteredDocuments() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
for (int i = 0; i < num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
document.add(newStringField("str_fld", "deleted", Field.Store.NO));
iw.addDocument(document);
document.clear();
}
Filter filter = new QueryWrapperFilter(new TermsQuery("str_fld", new BytesRef("non_existent")));
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
// no random access required;
// calling suggest with filter that does not match any documents should early terminate
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", num, filter);
assertThat(suggest.totalHits, equalTo(0));
reader.close();
iw.close();
}
@Test
public void testSuggestOnAllDeletedDocuments() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
// using IndexWriter instead of RandomIndexWriter
IndexWriter iw = new IndexWriter(dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
for (int i = 0; i < num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
iw.addDocument(document);
document.clear();
}
DirectoryReader reader = DirectoryReader.open(iw, false);
for (int docID = 0; docID < reader.maxDoc(); docID++) {
assertTrue(iw.tryDeleteDocument(reader, docID));
}
reader.close();
reader = DirectoryReader.open(iw, false);
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", num);
assertThat(suggest.totalHits, equalTo(0));
reader.close();
iw.close();
}
@Test
public void testSuggestOnMostlyDeletedDocuments() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
// using IndexWriter instead of RandomIndexWriter
IndexWriter iw = new IndexWriter(dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
for (int i = 1; i <= num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
document.add(new IntField("weight_fld", i, Field.Store.YES));
iw.addDocument(document);
document.clear();
}
DirectoryReader reader = DirectoryReader.open(iw, false);
// delete all but the lowest scored suggestion
for (int docID = 0; docID < reader.maxDoc(); docID++) {
StoredDocument doc = reader.document(docID);
StorableField[] weights = doc.getFields("weight_fld");
assertThat(weights.length, equalTo(1));
int weight = (int) weights[0].numericValue();
if (weight != 1) {
assertTrue(iw.tryDeleteDocument(reader, docID));
}
}
reader.close();
reader = DirectoryReader.open(iw, false);
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", 1);
assertSuggestions(suggest, new Entry("abc_1", 1));
reader.close();
iw.close();
}
@Test
public void testSuggestOnMostlyFilteredOutDocuments() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
for (int i = 0; i < num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
document.add(new IntField("filter_int_fld", i, Field.Store.NO));
iw.addDocument(document);
document.clear();
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
int topScore = num/2;
QueryWrapperFilter filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", 0, topScore, true, true));
Filter filter = randomAccessFilter(filterWrapper);
// if at most half of the top scoring documents have been filtered out
// the search should be admissible
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", 1, filter);
assertSuggestions(suggest, new Entry("abc_" + topScore, topScore));
filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", 0, 0, true, true));
filter = randomAccessFilter(filterWrapper);
// if more than half of the top scoring documents have been filtered out
// search is not admissible, so # of suggestions requested is num instead of 1
suggest = indexSearcher.suggest("suggest_field", "abc_", num, filter);
assertSuggestions(suggest, new Entry("abc_0", 0));
filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", num - 1, num - 1, true, true));
filter = randomAccessFilter(filterWrapper);
// if only lower scoring documents are filtered out
// search is admissible
suggest = indexSearcher.suggest("suggest_field", "abc_", 1, filter);
assertSuggestions(suggest, new Entry("abc_" + (num - 1), num - 1));
reader.close();
iw.close();
}
@Test
public void testEarlyTermination() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
// have segments of 4 documents
// with descending suggestion weights
// suggest should early terminate for
// segments with docs having lower suggestion weights
for (int i = num; i > 0; i--) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
iw.addDocument(document);
document.clear();
if (i % 4 == 0) {
iw.commit();
}
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", 1);
assertSuggestions(suggest, new Entry("abc_" + num, num));
reader.close();
iw.close();
}
@Test
public void testMultipleSegments() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
List<Entry> entries = new ArrayList<>();
// ensure at least some segments have no suggest field
for (int i = num; i > 0; i--) {
if (random().nextInt(4) == 1) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
entries.add(new Entry("abc_" + i, i));
}
document.add(new IntField("weight_fld", i, Field.Store.YES));
iw.addDocument(document);
document.clear();
if (usually()) {
iw.commit();
}
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", (entries.size() == 0) ? 1 : entries.size());
assertSuggestions(suggest, entries.toArray(new Entry[entries.size()]));
reader.close();
iw.close();
}
@Test
public void testDocFiltering() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
Document document = new Document();
document.add(new IntField("filter_int_fld", 9, Field.Store.NO));
document.add(newSuggestField("suggest_field", "apples", 3));
iw.addDocument(document);
document.clear();
document.add(new IntField("filter_int_fld", 10, Field.Store.NO));
document.add(newSuggestField("suggest_field", "applle", 4));
iw.addDocument(document);
document.clear();
document.add(new IntField("filter_int_fld", 4, Field.Store.NO));
document.add(newSuggestField("suggest_field", "apple", 5));
iw.addDocument(document);
if (rarely()) {
iw.commit();
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
// suggest without filter
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "app", 3);
assertSuggestions(suggest, new Entry("apple", 5), new Entry("applle", 4), new Entry("apples", 3));
// suggest with filter
QueryWrapperFilter filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", 5, 12, true, true));
Filter filter = randomAccessFilter(filterWrapper);
suggest = indexSearcher.suggest("suggest_field", "app", 3, filter);
assertSuggestions(suggest, new Entry("applle", 4), new Entry("apples", 3));
reader.close();
iw.close();
}
@Test
public void testReturnedDocID() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
Document document = new Document();
int num = atLeast(10);
for (int i = 0; i < num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, num));
document.add(new IntField("int_field", i, Field.Store.YES));
iw.addDocument(document);
document.clear();
if (random().nextBoolean()) {
iw.commit();
}
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", num);
assertEquals(num, suggest.totalHits);
for (SuggestScoreDoc suggestScoreDoc : suggest.scoreLookupDocs()) {
String key = suggestScoreDoc.key.toString();
assertTrue(key.startsWith("abc_"));
String substring = key.substring(4);
int fieldValue = Integer.parseInt(substring);
StoredDocument doc = reader.document(suggestScoreDoc.doc);
assertEquals(doc.getField("int_field").numericValue().intValue(), fieldValue);
}
reader.close();
iw.close();
}
@Test
public void testCompletionAnalyzerOptions() throws Exception {
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
Map<String, Analyzer> map = new HashMap<>();
map.put("suggest_field_default", new CompletionAnalyzer(analyzer));
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, true);
map.put("suggest_field_no_p_sep", completionAnalyzer);
completionAnalyzer = new CompletionAnalyzer(analyzer, true, false);
map.put("suggest_field_no_p_pos_inc", completionAnalyzer);
completionAnalyzer = new CompletionAnalyzer(analyzer, false, false);
map.put("suggest_field_no_p_sep_or_pos_inc", completionAnalyzer);
PerFieldAnalyzerWrapper analyzerWrapper = new PerFieldAnalyzerWrapper(analyzer, map);
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzerWrapper, map.keySet()));
Document document = new Document();
document.add(newSuggestField("suggest_field_default", "foobar", 7));
document.add(newSuggestField("suggest_field_default", "foo bar", 8));
document.add(newSuggestField("suggest_field_default", "the fo", 9));
document.add(newSuggestField("suggest_field_default", "the foo bar", 10));
document.add(newSuggestField("suggest_field_no_p_sep", "foobar", 7));
document.add(newSuggestField("suggest_field_no_p_sep", "foo bar", 8));
document.add(newSuggestField("suggest_field_no_p_sep", "the fo", 9));
document.add(newSuggestField("suggest_field_no_p_sep", "the foo bar", 10));
document.add(newSuggestField("suggest_field_no_p_pos_inc", "foobar", 7));
document.add(newSuggestField("suggest_field_no_p_pos_inc", "foo bar", 8));
document.add(newSuggestField("suggest_field_no_p_pos_inc", "the fo", 9));
document.add(newSuggestField("suggest_field_no_p_pos_inc", "the foo bar", 10));
document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "foobar", 7));
document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "foo bar", 8));
document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "the fo", 9));
document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "the foo bar", 10));
iw.addDocument(document);
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest;
suggest = indexSearcher.suggest("suggest_field_default", "fo", 4);
assertSuggestions(suggest, new Entry("foo bar", 8), new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_default", "foob", 4);
assertSuggestions(suggest, new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_sep", "fo", 4); // matches all 4
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_sep", "foob", 4); // except the fo
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_pos_inc", "fo", 4); //matches all 4
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_pos_inc", "foob", 4); // only foobar
assertSuggestions(suggest, new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_sep_or_pos_inc", "fo", 4); // all 4
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_sep_or_pos_inc", "foob", 4); // not the fo
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
reader.close();
iw.close();
}
@Test
public void testScoring() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(100);
String[] prefixes = {"abc", "bac", "cab"};
Map<String, Long> mappings = new HashMap<>();
for (int i = 0; i < num; i++) {
Document document = new Document();
String suggest = prefixes[i % 3] + TestUtil.randomSimpleString(random(), 10) + "_" + String.valueOf(i);
long weight = Math.abs(random().nextLong());
document.add(newSuggestField("suggest_field", suggest, weight));
mappings.put(suggest, weight);
iw.addDocument(document);
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
for (String prefix : prefixes) {
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", prefix, num);
assertTrue(suggest.totalHits > 0);
float topScore = -1;
for (SuggestScoreDoc scoreDoc : suggest.scoreLookupDocs()) {
if (topScore != -1) {
assertTrue(topScore >= scoreDoc.score);
}
topScore = scoreDoc.score;
assertThat((float) mappings.get(scoreDoc.key.toString()), equalTo(scoreDoc.score));
assertNotNull(mappings.remove(scoreDoc.key.toString()));
}
}
assertThat(mappings.size(), equalTo(0));
reader.close();
iw.close();
}
@Test
public void testRealisticKeys() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
LineFileDocs lineFileDocs = new LineFileDocs(random());
int num = atLeast(100);
Map<String, Long> mappings = new HashMap<>();
for (int i = 0; i < num; i++) {
Document document = lineFileDocs.nextDoc();
String title = document.getField("title").stringValue();
long weight = Math.abs(random().nextLong());
Long prevWeight = mappings.get(title);
if (prevWeight == null || prevWeight < weight) {
mappings.put(title, weight);
}
Document doc = new Document();
doc.add(newSuggestField("suggest_field", title, weight));
iw.addDocument(doc);
if (rarely()) {
iw.commit();
}
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
for (Map.Entry<String, Long> entry : mappings.entrySet()) {
String title = entry.getKey();
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", title, mappings.size());
assertTrue(suggest.totalHits > 0);
boolean matched = false;
for (ScoreDoc scoreDoc : suggest.scoreDocs) {
matched = Float.compare(scoreDoc.score, (float) entry.getValue()) == 0;
if (matched) {
break;
}
}
assertTrue("at least one of the entries should have the score", matched);
}
reader.close();
iw.close();
}
@Test
public void testThreads() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field_1", "suggest_field_2", "suggest_field_3"));
int num = atLeast(100);
final String prefix1 = "abc1_";
final String prefix2 = "abc2_";
final String prefix3 = "abc3_";
final Entry[] entries1 = new Entry[num];
final Entry[] entries2 = new Entry[num];
final Entry[] entries3 = new Entry[num];
for (int i = 0; i < num; i++) {
int weight = num - (i + 1);
entries1[i] = new Entry(prefix1 + weight, weight);
entries2[i] = new Entry(prefix2 + weight, weight);
entries3[i] = new Entry(prefix3 + weight, weight);
}
for (int i = 0; i < num; i++) {
Document doc = new Document();
doc.add(newSuggestField("suggest_field_1", prefix1 + i, i));
doc.add(newSuggestField("suggest_field_2", prefix2 + i, i));
doc.add(newSuggestField("suggest_field_3", prefix3 + i, i));
iw.addDocument(doc);
if (rarely()) {
iw.commit();
}
}
DirectoryReader reader = iw.getReader();
int numThreads = TestUtil.nextInt(random(), 2, 7);
Thread[] threads = new Thread[numThreads];
final CyclicBarrier startingGun = new CyclicBarrier(numThreads + 1);
final CopyOnWriteArrayList<Throwable> errors = new CopyOnWriteArrayList<>();
final SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
for (int i = 0; i < threads.length; i++) {
threads[i] = new Thread() {
@Override
public void run() {
try {
startingGun.await();
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field_1", prefix1, num);
assertSuggestions(suggest, entries1);
suggest = indexSearcher.suggest("suggest_field_2", prefix2, num);
assertSuggestions(suggest, entries2);
suggest = indexSearcher.suggest("suggest_field_3", prefix3, num);
assertSuggestions(suggest, entries3);
} catch (Throwable e) {
errors.add(e);
}
}
};
threads[i].start();
}
startingGun.await();
for (Thread t : threads) {
t.join();
}
assertTrue(errors.toString(), errors.isEmpty());
reader.close();
iw.close();
}
private static Filter randomAccessFilter(Filter filter) {
return new Filter() {
@Override
public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException {
DocIdSet docIdSet = filter.getDocIdSet(context, acceptDocs);
DocIdSetIterator iterator = docIdSet.iterator();
FixedBitSet bits = new FixedBitSet(context.reader().maxDoc());
if (iterator != null) {
int doc;
while((doc = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
bits.set(doc);
}
}
return new DocIdSet() {
@Override
public DocIdSetIterator iterator() throws IOException {
return iterator;
}
@Override
public Bits bits() throws IOException {
return bits;
}
@Override
public long ramBytesUsed() {
return docIdSet.ramBytesUsed();
}
};
}
@Override
public String toString(String field) {
return filter.toString(field);
}
};
}
private static class Entry {
private final String output;
private final float value;
private Entry(String output, float value) {
this.output = output;
this.value = value;
}
}
private void assertSuggestions(TopDocs actual, Entry... expected) {
SuggestScoreDoc[] suggestScoreDocs = (SuggestScoreDoc[]) actual.scoreDocs;
assertThat(suggestScoreDocs.length, equalTo(expected.length));
for (int i = 0; i < suggestScoreDocs.length; i++) {
SuggestScoreDoc lookupDoc = suggestScoreDocs[i];
assertThat(lookupDoc.key.toString(), equalTo(expected[i].output));
assertThat(lookupDoc.score, equalTo(expected[i].value));
}
}
private SuggestField newSuggestField(String name, String value, long weight) throws IOException {
return new SuggestField(name, value, weight);
}
private IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, String... suggestFields) {
return iwcWithSuggestField(analyzer, asSet(suggestFields));
}
private IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, Set<String> suggestFields) {
IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
iwc.setMergePolicy(newLogMergePolicy());
Codec filterCodec = new Lucene50Codec() {
PostingsFormat postingsFormat = new Completion50PostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
if (suggestFields.contains(field)) {
return postingsFormat;
}
return super.getPostingsFormatForField(field);
}
};
iwc.setCodec(filterCodec);
return iwc;
}
}