LUCENE-6339: Added Near-real time Document Suggester via custom postings format

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1669698 13f79535-47bb-0310-9956-ffa450edef68
Areek Zillur 2015-03-27 22:37:49 +00:00
parent f49f2b2af6
commit b7adb6ac77
18 changed files with 3005 additions and 1 deletion


@@ -19,6 +19,9 @@ New Features
for counting ranges that align with the underlying terms as defined by the
NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley)
* LUCENE-6339: Added Near-real time Document Suggester via custom postings format
(Areek Zillur, Mike McCandless, Simon Willnauer)
API Changes
* LUCENE-3312: The API of oal.document was restructured to


@@ -49,7 +49,7 @@ public class FSTUtil {
public final FST.Arc<T> fstNode;
/** Output of the path so far: */
-  T output;
+  public final T output;
/** Input of the path so far: */
public final IntsRefBuilder input;


@@ -0,0 +1,42 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.codecs.PostingsFormat;
/**
* {@link org.apache.lucene.search.suggest.document.CompletionPostingsFormat}
* for {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat}
*
* @lucene.experimental
*/
public class Completion50PostingsFormat extends CompletionPostingsFormat {
/**
* Sole Constructor
*/
public Completion50PostingsFormat() {
super();
}
@Override
protected PostingsFormat delegatePostingsFormat() {
return PostingsFormat.forName("Lucene50");
}
}
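
A minimal wiring sketch (illustrative, not part of this change set): a suggest field is routed to this postings format per field by overriding the codec's getPostingsFormatForField. The Lucene50Codec base class is assumed from this codebase and the field name "suggest_field" is hypothetical; all other fields stay on the codec's default postings format.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.suggest.document.Completion50PostingsFormat;

class CompletionCodecWiring {
  static IndexWriterConfig newSuggestConfig() {
    final PostingsFormat completionFormat = new Completion50PostingsFormat();
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    config.setCodec(new Lucene50Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        // hypothetical suggest field name; everything else keeps the default format
        return "suggest_field".equals(field) ? completionFormat : super.getPostingsFormatForField(field);
      }
    });
    return config;
  }
}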


@@ -0,0 +1,173 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
/**
* Wraps an {@link org.apache.lucene.analysis.Analyzer}
* to provide additional completion-only tuning
* (e.g. preserving token separators, preserving position increments while converting
* a token stream to an automaton)
* <p>
* Can be used to index {@link SuggestField}
* and as a query analyzer to {@link SuggestIndexSearcher}
* <p>
* NOTE: In most cases, the index and query analyzer should have the same values for {@link #preservePositionIncrements}
* and {@link #preserveSep}
*
* @lucene.experimental
*/
public class CompletionAnalyzer extends AnalyzerWrapper {
/**
* Represents the separation between tokens, if
* <code>preserveSep</code> is <code>true</code>
* <p>
* Same label is used as a delimiter in the {@link org.apache.lucene.search.suggest.document.CompletionTokenStream}
* payload
*/
final static int SEP_LABEL = NRTSuggesterBuilder.PAYLOAD_SEP;
/**
* Represents the hole character, inserted by {@link org.apache.lucene.analysis.TokenStreamToAutomaton}
*/
final static int HOLE_CHARACTER = TokenStreamToAutomaton.HOLE;
final static int DEFAULT_MAX_GRAPH_EXPANSIONS = -1;
final static boolean DEFAULT_PRESERVE_SEP = true;
final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
private final Analyzer analyzer;
/**
* Preserve separation between tokens
* when converting to an automaton
* <p>
* Defaults to <code>true</code>
*/
private final boolean preserveSep;
/**
* Preserve position increments for tokens
* when converting to an automaton
* <p>
* Defaults to <code>true</code>
*/
private final boolean preservePositionIncrements;
/**
* Sets the maximum number of graph expansions of a completion automaton
* <p>
* Defaults to <code>-1</code> (no limit)
*/
private final int maxGraphExpansions;
/**
* Wraps an analyzer to convert its output token stream to an automaton
*
* @param analyzer analyzer whose output token stream is converted to an automaton
* @param preserveSep Preserve separation between tokens when converting to an automaton
* @param preservePositionIncrements Preserve position increments for tokens when converting to an automaton
* @param maxGraphExpansions Sets the maximum number of graph expansions of a completion automaton
*/
public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
super(PER_FIELD_REUSE_STRATEGY);
this.analyzer = analyzer;
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.maxGraphExpansions = maxGraphExpansions;
}
/**
* Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
* preserving token separation, position increments and no limit on graph expansions
*/
public CompletionAnalyzer(Analyzer analyzer) {
this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
}
/**
* Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
* with no limit on graph expansions
*/
public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements) {
this(analyzer, preserveSep, preservePositionIncrements, DEFAULT_MAX_GRAPH_EXPANSIONS);
}
/**
* Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
* preserving token separation and position increments
*/
public CompletionAnalyzer(Analyzer analyzer, int maxGraphExpansions) {
this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions);
}
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return analyzer;
}
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
CompletionTokenStream tokenStream = new CompletionTokenStream(components.getTokenStream(),
preserveSep, preservePositionIncrements, SEP_LABEL, maxGraphExpansions);
return new TokenStreamComponents(components.getTokenizer(), tokenStream);
}
/**
* Converts <code>key</code> to an automaton using
* {@link #preservePositionIncrements}, {@link #preserveSep}
* and {@link #maxGraphExpansions}
*/
public Automaton toAutomaton(String field, CharSequence key) throws IOException {
for (int i = 0; i < key.length(); i++) {
switch (key.charAt(i)) {
case HOLE_CHARACTER:
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
case SEP_LABEL:
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
default:
break;
}
}
try (TokenStream tokenStream = analyzer.tokenStream(field, key.toString())) {
try(CompletionTokenStream stream = new CompletionTokenStream(tokenStream,
preserveSep, preservePositionIncrements, SEP_LABEL, maxGraphExpansions)) {
return stream.toAutomaton(tokenStream);
}
}
}
}
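
A usage sketch for the analyzer above (illustrative, not part of this change set; the field name, lookup key and expansion limit are assumptions): the same CompletionAnalyzer can wrap the indexing analyzer at index time and turn a lookup key into an automaton at query time.

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.suggest.document.CompletionAnalyzer;
import org.apache.lucene.util.automaton.Automaton;

class CompletionAnalyzerUsage {
  static Automaton prefixAutomaton() throws IOException {
    // preserve separators and position increments, cap graph expansions at 256 (illustrative value)
    CompletionAnalyzer analyzer = new CompletionAnalyzer(new StandardAnalyzer(), true, true, 256);
    // the automaton accepts every analyzed form of the key; it is intersected
    // with the field's FST at lookup time
    return analyzer.toAutomaton("suggest_field", "elec");
  }
}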


@@ -0,0 +1,192 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.CODEC_NAME;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;
/**
* <p>
* Weighted FSTs for all indexed {@link SuggestField}s are built on {@link #write(Fields)}.
* A weighted FST maps the analyzed forms of a field to its
* surface form and document id. FSTs are stored in the CompletionDictionary (.lkp).
* </p>
* <p>
* The file offsets of a field's FST are stored in the CompletionIndex (.cmp)
* along with the field's internal number {@link FieldInfo#number} on {@link #close()}.
* </p>
*
*/
final class CompletionFieldsConsumer extends FieldsConsumer {
private final String delegatePostingsFormatName;
private final Map<String, Long> seenFields = new HashMap<>();
private final SegmentWriteState state;
private IndexOutput dictOut;
private FieldsConsumer delegateFieldsConsumer;
CompletionFieldsConsumer(PostingsFormat delegatePostingsFormat, SegmentWriteState state) throws IOException {
this.delegatePostingsFormatName = delegatePostingsFormat.getName();
this.state = state;
String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
boolean success = false;
try {
this.delegateFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
dictOut = state.directory.createOutput(dictFile, state.context);
CodecUtil.writeIndexHeader(dictOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
}
}
}
@Override
public void write(Fields fields) throws IOException {
delegateFieldsConsumer.write(fields);
for (String field : fields) {
CompletionTermWriter termWriter = new CompletionTermWriter();
Terms terms = fields.terms(field);
TermsEnum termsEnum = terms.iterator(null);
// write terms
BytesRef term;
while ((term = termsEnum.next()) != null) {
termWriter.write(term, termsEnum);
}
// store lookup, if needed
long filePointer = dictOut.getFilePointer();
if (termWriter.finish(dictOut)) {
seenFields.put(field, filePointer);
}
}
}
private boolean closed = false;
@Override
public void close() throws IOException {
if (closed) {
return;
}
closed = true;
String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
boolean success = false;
try (IndexOutput indexOut = state.directory.createOutput(indexFile, state.context)) {
delegateFieldsConsumer.close();
CodecUtil.writeIndexHeader(indexOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
/*
* we write the delegate postings format name so we can load it
* without getting an instance in the ctor
*/
indexOut.writeString(delegatePostingsFormatName);
// write # of seen fields
indexOut.writeVInt(seenFields.size());
// write field numbers and dictOut offsets
for (Map.Entry<String, Long> seenField : seenFields.entrySet()) {
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(seenField.getKey());
indexOut.writeVInt(fieldInfo.number);
indexOut.writeVLong(seenField.getValue());
}
CodecUtil.writeFooter(indexOut);
CodecUtil.writeFooter(dictOut);
IOUtils.close(dictOut);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
}
}
}
// builds an FST based on the terms written
private static class CompletionTermWriter {
private PostingsEnum postingsEnum = null;
private int docCount = 0;
private final BytesRefBuilder scratch = new BytesRefBuilder();
private final NRTSuggesterBuilder builder;
public CompletionTermWriter() {
builder = new NRTSuggesterBuilder();
}
/**
* Stores the built FST in <code>output</code>
* Returns true if there was anything stored, false otherwise
*/
public boolean finish(IndexOutput output) throws IOException {
boolean stored = builder.store(output);
assert stored || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
return stored;
}
/**
* Writes all postings (surface form, weight, document id) for <code>term</code>
*/
public void write(BytesRef term, TermsEnum termsEnum) throws IOException {
postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.PAYLOADS);
builder.startTerm(term);
int docFreq = 0;
while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int docID = postingsEnum.docID();
for (int i = 0; i < postingsEnum.freq(); i++) {
postingsEnum.nextPosition();
assert postingsEnum.getPayload() != null;
BytesRef payload = postingsEnum.getPayload();
ByteArrayDataInput input = new ByteArrayDataInput(payload.bytes, payload.offset, payload.length);
int len = input.readVInt();
scratch.grow(len);
scratch.setLength(len);
input.readBytes(scratch.bytes(), 0, scratch.length());
builder.addEntry(docID, scratch.get(), input.readVLong() - 1);
}
docFreq++;
docCount = Math.max(docCount, docFreq + 1);
}
builder.finishTerm();
}
}
}
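
For orientation, a standalone sketch of the payload layout consumed by write() above (illustrative, not commit code): each position's payload is a vint length prefix, the surface form bytes, and the weight stored as weight + 1, which is why the reader takes readVLong() - 1.

import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;

class SuggestPayloadRoundTrip {
  static void demo() throws IOException {
    BytesRef surface = new BytesRef("electronics");
    long weight = 4;
    byte[] buffer = new byte[surface.length + 16];
    ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
    out.writeVInt(surface.length);                                   // length prefix
    out.writeBytes(surface.bytes, surface.offset, surface.length);   // surface form bytes
    out.writeVLong(weight + 1);                                      // weight shifted by one
    ByteArrayDataInput in = new ByteArrayDataInput(buffer, 0, out.getPosition());
    int len = in.readVInt();
    byte[] surfaceBytes = new byte[len];
    in.readBytes(surfaceBytes, 0, len);
    long decodedWeight = in.readVLong() - 1;                         // back to the original weight
    assert decodedWeight == weight && new BytesRef(surfaceBytes).equals(surface);
  }
}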


@@ -0,0 +1,228 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.IOUtils;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.CODEC_NAME;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_CODEC_VERSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;
/**
* <p>
* Completion index (.cmp) is opened and read at instantiation to read in {@link SuggestField}
* numbers and their FST offsets in the Completion dictionary (.lkp).
* </p>
* <p>
* Completion dictionary (.lkp) is opened at instantiation and a field's FST is loaded
* into memory the first time it is requested via {@link #terms(String)}.
* </p>
* <p>
* NOTE: Only the footer is validated for the Completion dictionary (.lkp), not the checksum, due
* to its random access pattern and checksum validation being too costly at instantiation
* </p>
*
*/
final class CompletionFieldsProducer extends FieldsProducer {
private FieldsProducer delegateFieldsProducer;
private Map<String, CompletionsTermsReader> readers;
private IndexInput dictIn;
// copy ctor for merge instance
private CompletionFieldsProducer(FieldsProducer delegateFieldsProducer, Map<String, CompletionsTermsReader> readers) {
this.delegateFieldsProducer = delegateFieldsProducer;
this.readers = readers;
}
CompletionFieldsProducer(SegmentReadState state) throws IOException {
String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
delegateFieldsProducer = null;
boolean success = false;
try (ChecksumIndexInput index = state.directory.openChecksumInput(indexFile, state.context)) {
// open up dict file containing all fsts
String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
dictIn = state.directory.openInput(dictFile, state.context);
CodecUtil.checkIndexHeader(dictIn, CODEC_NAME, COMPLETION_CODEC_VERSION, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
// just validate the footer for the dictIn
CodecUtil.retrieveChecksum(dictIn);
// open up index file (fieldNumber, offset)
CodecUtil.checkIndexHeader(index, CODEC_NAME, COMPLETION_CODEC_VERSION, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
// load delegate PF
PostingsFormat delegatePostingsFormat = PostingsFormat.forName(index.readString());
delegateFieldsProducer = delegatePostingsFormat.fieldsProducer(state);
// read suggest field numbers and their offsets in the terms file from index
int numFields = index.readVInt();
readers = new HashMap<>(numFields);
for (int i = 0; i < numFields; i++) {
int fieldNumber = index.readVInt();
long offset = index.readVLong();
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNumber);
// we don't load the FST yet
readers.put(fieldInfo.name, new CompletionsTermsReader(offset));
}
CodecUtil.checkFooter(index);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(delegateFieldsProducer, dictIn);
}
}
}
@Override
public void close() throws IOException {
boolean success = false;
try {
delegateFieldsProducer.close();
IOUtils.close(dictIn);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(delegateFieldsProducer, dictIn);
}
}
}
@Override
public void checkIntegrity() throws IOException {
delegateFieldsProducer.checkIntegrity();
// TODO: checkIntegrity should checksum the dictionary and index
}
@Override
public FieldsProducer getMergeInstance() throws IOException {
return new CompletionFieldsProducer(delegateFieldsProducer, readers);
}
@Override
public long ramBytesUsed() {
long ramBytesUsed = delegateFieldsProducer.ramBytesUsed();
for (CompletionsTermsReader reader : readers.values()) {
ramBytesUsed += reader.ramBytesUsed();
}
return ramBytesUsed;
}
@Override
public Collection<Accountable> getChildResources() {
List<Accountable> accountableList = new ArrayList<>();
for (Map.Entry<String, CompletionsTermsReader> readerEntry : readers.entrySet()) {
accountableList.add(Accountables.namedAccountable(readerEntry.getKey(), readerEntry.getValue()));
}
return Collections.unmodifiableCollection(accountableList);
}
@Override
public Iterator<String> iterator() {
return readers.keySet().iterator();
}
@Override
public Terms terms(String field) throws IOException {
return new CompletionTerms(delegateFieldsProducer.terms(field), readers.get(field));
}
@Override
public int size() {
return readers.size();
}
private class CompletionsTermsReader implements Accountable {
private final long offset;
private NRTSuggester suggester;
public CompletionsTermsReader(long offset) throws IOException {
assert offset >= 0l && offset < dictIn.length();
this.offset = offset;
}
public synchronized NRTSuggester suggester() throws IOException {
if (suggester == null) {
try (IndexInput dictClone = dictIn.clone()) { // let multiple fields load concurrently
dictClone.seek(offset);
suggester = NRTSuggester.load(dictClone);
}
}
return suggester;
}
@Override
public long ramBytesUsed() {
return (suggester != null) ? suggester.ramBytesUsed() : 0;
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.emptyList();
}
}
/**
* Thin wrapper over {@link org.apache.lucene.index.Terms} with
* a {@link NRTSuggester}
*/
public static class CompletionTerms extends FilterLeafReader.FilterTerms {
private final CompletionsTermsReader reader;
public CompletionTerms(Terms in, CompletionsTermsReader reader) {
super(in);
this.reader = reader;
}
/**
* Returns a {@link NRTSuggester} for the field
* or <code>null</code> if no FST
* was indexed for this field
*/
public NRTSuggester suggester() throws IOException {
if (reader == null) {
return null;
}
return reader.suggester();
}
}
}
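
A sketch of how the lazily loaded suggester is reached from a leaf reader (illustrative, not part of this change set): NRTSuggester and this producer are package-private, so the sketch assumes same-package access, which is how the suggest searcher uses it.

package org.apache.lucene.search.suggest.document;

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;

class SuggesterLookupSketch {
  static NRTSuggester suggesterFor(LeafReader reader, String field) throws IOException {
    Terms terms = reader.terms(field);
    if (terms instanceof CompletionFieldsProducer.CompletionTerms) {
      // the first call loads the field's FST from the .lkp file; later calls reuse it
      return ((CompletionFieldsProducer.CompletionTerms) terms).suggester();
    }
    return null; // field was not indexed with a completion postings format
  }
}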


@@ -0,0 +1,121 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.FST;
/**
* <p>
* A {@link PostingsFormat} which supports document suggestion based on
* indexed {@link SuggestField}s.
* Document suggestion is based on a weighted FST which maps analyzed
* terms of a {@link SuggestField} to its surface form and document id.
* </p>
* <p>
* Files:
* <ul>
* <li><tt>.lkp</tt>: <a href="#Completiondictionary">Completion Dictionary</a></li>
* <li><tt>.cmp</tt>: <a href="#Completionindex">Completion Index</a></li>
* </ul>
* <p>
* <a name="Completiondictionary"></a>
* <h3>Completion Dictionary</h3>
* <p>The .lkp file contains an FST for each suggest field
* </p>
* <ul>
* <li>CompletionDict (.lkp) --&gt; Header, FST<sup>NumSuggestFields</sup>, Footer</li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <!-- TODO: should the FST output be mentioned at all? -->
* <li>FST --&gt; {@link FST FST&lt;Long, BytesRef&gt;}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* <p>Notes:</p>
* <ul>
* <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
* for the Completion implementation.</li>
* <li>FST maps all analyzed forms to surface forms of a SuggestField</li>
* </ul>
* <a name="Completionindex"></a>
* <h3>Completion Index</h3>
* <p>The .cmp file contains an index into the completion dictionary, so that it can be
* accessed randomly.</p>
* <ul>
* <li>CompletionIndex (.cmp) --&gt; Header, NumSuggestFields, Entry<sup>NumSuggestFields</sup>, Footer</li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>NumSuggestFields --&gt; {@link DataOutput#writeVInt Uint32}</li>
* <li>Entry --&gt; FieldNumber, CompletionDictionaryOffset</li>
* <li>FieldNumber --&gt; {@link DataOutput#writeVInt Uint32}</li>
* <li>CompletionDictionaryOffset --&gt; {@link DataOutput#writeVLong Uint64}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* <p>Notes:</p>
* <ul>
* <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
* for the Completion implementation.</li>
* <li>NumSuggestFields is the number of suggest fields indexed</li>
* <li>FieldNumber is the field's number from {@link FieldInfos} (.fnm)</li>
* <li>CompletionDictionaryOffset is the file offset of a field's FST in CompletionDictionary (.lkp)</li>
* </ul>
*
* @lucene.experimental
*/
public abstract class CompletionPostingsFormat extends PostingsFormat {
static final String CODEC_NAME = "completion";
static final int COMPLETION_CODEC_VERSION = 1;
static final int COMPLETION_VERSION_CURRENT = COMPLETION_CODEC_VERSION;
static final String INDEX_EXTENSION = "cmp";
static final String DICT_EXTENSION = "lkp";
/**
* Used only by core Lucene at read-time via Service Provider instantiation
*/
public CompletionPostingsFormat() {
super(CODEC_NAME);
}
/**
* Concrete implementation should specify the delegating postings format
*/
protected abstract PostingsFormat delegatePostingsFormat();
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsFormat delegatePostingsFormat = delegatePostingsFormat();
if (delegatePostingsFormat == null) {
throw new UnsupportedOperationException("Error - " + getClass().getName()
+ " has been constructed without a choice of PostingsFormat");
}
return new CompletionFieldsConsumer(delegatePostingsFormat, state);
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
return new CompletionFieldsProducer(state);
}
}


@@ -0,0 +1,358 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Util;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP;
import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL;
/**
* Token stream which converts a provided token stream to an automaton.
* The accepted strings enumerated from the automaton are available through the
* {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute
* The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store
* a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)})
*
*/
final class CompletionTokenStream extends TokenStream {
private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);
private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
private final TokenStream input;
private final boolean preserveSep;
private final boolean preservePositionIncrements;
private final int sepLabel;
private final int maxGraphExpansions;
private BytesRef payload;
private Iterator<IntsRef> finiteStrings;
private int posInc = -1;
private CharTermAttribute charTermAttribute;
/**
* Creates a token stream to convert <code>input</code> to a token stream
* of accepted strings by its automaton.
* <p>
* The token stream <code>input</code> is converted to an automaton
* with the default settings of {@link org.apache.lucene.search.suggest.document.CompletionAnalyzer}
*/
public CompletionTokenStream(TokenStream input) {
this(input, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, SEP_LABEL, DEFAULT_MAX_GRAPH_EXPANSIONS);
}
CompletionTokenStream(TokenStream input, boolean preserveSep, boolean preservePositionIncrements, int sepLabel, int maxGraphExpansions) {
// Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
// the input stream entirely in toFiniteStrings(input)
this.input = input;
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.sepLabel = sepLabel;
this.maxGraphExpansions = maxGraphExpansions;
}
/**
* Returns a separator label that is reserved for the payload
* in {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)}
*/
public int sepLabel() {
return sepLabel;
}
/**
* Sets a payload available throughout successive token stream enumeration
*/
public void setPayload(BytesRef payload) {
this.payload = payload;
}
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
if (finiteStrings == null) {
//TODO: make this return an Iterator<IntsRef> instead?
Automaton automaton = toAutomaton(input);
Set<IntsRef> strings = Operations.getFiniteStrings(automaton, maxGraphExpansions);
posInc = strings.size();
finiteStrings = strings.iterator();
}
if (finiteStrings.hasNext()) {
posAttr.setPositionIncrement(posInc);
/*
* this posInc encodes the number of paths that this surface form
* produced. Multiple fields with the same surface form therefore sum up
*/
posInc = 0;
Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // now we have UTF-8
if (charTermAttribute != null) {
charTermAttribute.setLength(0);
charTermAttribute.append(bytesAtt.toUTF16());
}
if (payload != null) {
payloadAttr.setPayload(this.payload);
}
return true;
}
return false;
}
@Override
public void end() throws IOException {
super.end();
if (posInc == -1) {
input.end();
}
}
@Override
public void close() throws IOException {
if (posInc == -1) {
input.close();
}
}
@Override
public void reset() throws IOException {
super.reset();
if (hasAttribute(CharTermAttribute.class)) {
// we only create this if we really need it, to save the UTF-8 to UTF-16 conversion
charTermAttribute = getAttribute(CharTermAttribute.class);
}
finiteStrings = null;
posInc = -1;
}
/**
* Converts <code>tokenStream</code> to an automaton
*/
public Automaton toAutomaton(TokenStream tokenStream) throws IOException {
// TODO refactor this
// maybe we could hook up a modified automaton from TermAutomatonQuery here?
Automaton automaton = null;
try {
// Create corresponding automaton: labels are bytes
// from each analyzed token, with byte 0 used as
// separator between tokens:
final TokenStreamToAutomaton tsta;
if (preserveSep) {
tsta = new EscapingTokenStreamToAutomaton((char) SEP_LABEL);
} else {
// When we're not preserving sep, we don't steal 0xff
// byte, so we don't need to do any escaping:
tsta = new TokenStreamToAutomaton();
}
tsta.setPreservePositionIncrements(preservePositionIncrements);
automaton = tsta.toAutomaton(tokenStream);
} finally {
IOUtils.closeWhileHandlingException(tokenStream);
}
// TODO: we can optimize this somewhat by determinizing
// while we convert
automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
// This automaton should not blow up during determinize:
return Operations.determinize(automaton, maxGraphExpansions);
}
/**
* Just escapes the separator label byte (which we reserve for SEP).
*/
private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
final BytesRefBuilder spare = new BytesRefBuilder();
private char sepLabel;
public EscapingTokenStreamToAutomaton(char sepLabel) {
this.sepLabel = sepLabel;
}
@Override
protected BytesRef changeToken(BytesRef in) {
int upto = 0;
for (int i = 0; i < in.length; i++) {
byte b = in.bytes[in.offset + i];
if (b == (byte) sepLabel) {
spare.grow(upto + 2);
spare.setByteAt(upto++, (byte) sepLabel);
spare.setByteAt(upto++, b);
} else {
spare.grow(upto + 1);
spare.setByteAt(upto++, b);
}
}
spare.setLength(upto);
return spare.get();
}
}
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
Automaton result = new Automaton();
// Copy all states over
int numStates = a.getNumStates();
for (int s = 0; s < numStates; s++) {
result.createState();
result.setAccept(s, a.isAccept(s));
}
// Go in reverse topo sort so we know we only have to
// make one pass:
Transition t = new Transition();
int[] topoSortStates = topoSortStates(a);
for (int i = 0; i < topoSortStates.length; i++) {
int state = topoSortStates[topoSortStates.length - 1 - i];
int count = a.initTransition(state, t);
for (int j = 0; j < count; j++) {
a.getNextTransition(t);
if (t.min == TokenStreamToAutomaton.POS_SEP) {
assert t.max == TokenStreamToAutomaton.POS_SEP;
if (preserveSep) {
// Remap to SEP_LABEL:
result.addTransition(state, t.dest, sepLabel);
} else {
result.addEpsilon(state, t.dest);
}
} else if (t.min == TokenStreamToAutomaton.HOLE) {
assert t.max == TokenStreamToAutomaton.HOLE;
// Just remove the hole: there will then be two
// SEP tokens next to each other, which will only
// match another hole at search time. Note that
// it will also match an empty-string token ... if
// that's somehow a problem we can always map HOLE
// to a dedicated byte (and escape it in the
// input).
result.addEpsilon(state, t.dest);
} else {
result.addTransition(state, t.dest, t.min, t.max);
}
}
}
result.finishState();
return result;
}
private static int[] topoSortStates(Automaton a) {
int[] states = new int[a.getNumStates()];
final Set<Integer> visited = new HashSet<>();
final LinkedList<Integer> worklist = new LinkedList<>();
worklist.add(0);
visited.add(0);
int upto = 0;
states[upto] = 0;
upto++;
Transition t = new Transition();
while (worklist.size() > 0) {
int s = worklist.removeFirst();
int count = a.initTransition(s, t);
for (int i = 0; i < count; i++) {
a.getNextTransition(t);
if (!visited.contains(t.dest)) {
visited.add(t.dest);
worklist.add(t.dest);
states[upto++] = t.dest;
}
}
}
return states;
}
public interface ByteTermAttribute extends TermToBytesRefAttribute {
// marker interface
/**
* Return the builder from which the term is derived.
*/
public BytesRefBuilder builder();
public CharSequence toUTF16();
}
public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute {
private final BytesRefBuilder bytes = new BytesRefBuilder();
private CharsRefBuilder charsRef;
@Override
public void fillBytesRef() {
// does nothing - we change in place
}
@Override
public BytesRefBuilder builder() {
return bytes;
}
@Override
public BytesRef getBytesRef() {
return bytes.get();
}
@Override
public void clear() {
bytes.clear();
}
@Override
public void copyTo(AttributeImpl target) {
ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
other.bytes.copyBytes(bytes);
}
@Override
public CharSequence toUTF16() {
if (charsRef == null) {
charsRef = new CharsRefBuilder();
}
charsRef.copyUTF8Bytes(getBytesRef());
return charsRef.get();
}
}
}
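
A standalone sketch of driving this stream directly (illustrative, not part of this change set; the class is package-private, so same-package access and the field name are assumptions): each incrementToken() emits one accepted path of the automaton as the term bytes, all carrying the payload set beforehand.

package org.apache.lucene.search.suggest.document;

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

class CompletionTokenStreamSketch {
  static void demo() throws IOException {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    TokenStream plain = analyzer.tokenStream("suggest_field", "top gun");
    try (CompletionTokenStream completion = new CompletionTokenStream(plain)) {
      // opaque payload bytes (SuggestField stores the length-prefixed surface form and weight here)
      completion.setPayload(new BytesRef("payload"));
      TermToBytesRefAttribute termAtt = completion.getAttribute(TermToBytesRefAttribute.class);
      PayloadAttribute payloadAtt = completion.getAttribute(PayloadAttribute.class);
      completion.reset();
      while (completion.incrementToken()) {
        // one token per accepted path through the automaton
        System.out.println(termAtt.getBytesRef() + " payload=" + payloadAtt.getPayload());
      }
      completion.end();
    }
  }
}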


@@ -0,0 +1,324 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.suggest.analyzing.FSTUtil;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseDocID;
import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseSurfaceForm;
/**
* <p>
* NRTSuggester returns Top N completions with corresponding documents matching a provided automaton.
* The completions are returned in descending order of their corresponding weight.
* Deleted documents are filtered out in near real time using the provided reader.
* A {@link org.apache.lucene.search.DocIdSet} can be passed in at query time to filter out documents.
* </p>
* <p>
* See {@link #lookup(LeafReader, Automaton, int, DocIdSet, TopSuggestDocsCollector)} for more implementation
* details.
* <p>
* Builder: {@link NRTSuggesterBuilder}
* </p>
* <p>
* FST Format:
* <ul>
* <li>Input: analyzed forms of input terms</li>
* <li>Output: Pair&lt;Long, BytesRef&gt; containing weight, surface form and docID</li>
* </ul>
* <p>
* NOTE:
* <ul>
* <li>currently only {@link org.apache.lucene.search.DocIdSet} with random access capabilities are supported.</li>
* <li>having too many deletions or using a very restrictive filter can make the search inadmissible due to
* over-pruning of potential paths</li>
* <li>when a {@link org.apache.lucene.search.DocIdSet} is used, it is assumed that the filter will roughly
* filter out half the number of documents that match the provided automaton</li>
* <li>lookup performance will degrade as more accepted completions lead to filtered out documents</li>
* </ul>
*
*/
final class NRTSuggester implements Accountable {
/**
* FST<Weight,Surface>:
* input is the analyzed form, with a null byte between terms
* and a {@link NRTSuggesterBuilder#END_BYTE} to denote the
* end of the input
* weight is a long
* surface is the original, unanalyzed form followed by the docID
*/
private final FST<Pair<Long, BytesRef>> fst;
/**
* Highest number of analyzed paths we saw for any single
* input surface form. This can be > 1, when the index analyzer
* creates graphs or if multiple surface forms yield the
* same analyzed form
*/
private final int maxAnalyzedPathsPerOutput;
/**
* Separator used between surface form and its docID in the FST output
*/
private final int payloadSep;
/**
* Label used to denote the end of an input in the FST and
* the beginning of dedup bytes
*/
private final int endByte;
/**
* Maximum queue depth for TopNSearcher
*
* NOTE: value should be <= Integer.MAX_VALUE
*/
private static final long MAX_TOP_N_QUEUE_SIZE = 1000;
private NRTSuggester(FST<Pair<Long, BytesRef>> fst, int maxAnalyzedPathsPerOutput, int payloadSep, int endByte) {
this.fst = fst;
this.maxAnalyzedPathsPerOutput = maxAnalyzedPathsPerOutput;
this.payloadSep = payloadSep;
this.endByte = endByte;
}
@Override
public long ramBytesUsed() {
return fst == null ? 0 : fst.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.emptyList();
}
private static Comparator<Pair<Long, BytesRef>> getComparator() {
return new Comparator<Pair<Long, BytesRef>>() {
@Override
public int compare(Pair<Long, BytesRef> o1, Pair<Long, BytesRef> o2) {
return Long.compare(o1.output1, o2.output1);
}
};
}
/**
* Collects at most the top <code>num</code> completions with a prefix accepted by <code>automaton</code>,
* filtered by <code>filter</code> on the corresponding documents
* <p>
* Supports near real time deleted document filtering using <code>reader</code>
* <p>
* {@link TopSuggestDocsCollector#collect(int, CharSequence, long)} is called
* for every matched completion
* <p>
* Completion collection can be early terminated by throwing {@link org.apache.lucene.search.CollectionTerminatedException}
*/
public void lookup(final LeafReader reader, final Automaton automaton, final int num, final DocIdSet filter, final TopSuggestDocsCollector collector) {
final Bits filterDocs;
try {
if (filter != null) {
if (filter.iterator() == null) {
return;
}
if (filter.bits() == null) {
throw new IllegalArgumentException("DocIDSet does not provide random access interface");
} else {
filterDocs = filter.bits();
}
} else {
filterDocs = null;
}
} catch (IOException e) {
throw new RuntimeException(e);
}
int queueSize = getMaxTopNSearcherQueueSize(num, reader, filterDocs != null);
if (queueSize == -1) {
return;
}
final Bits liveDocs = reader.getLiveDocs();
try {
final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num, queueSize, getComparator()) {
private final CharsRefBuilder spare = new CharsRefBuilder();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
int payloadSepIndex = parseSurfaceForm(output.output2, payloadSep, spare);
int docID = parseDocID(output.output2, payloadSepIndex);
// filter out deleted docs only if no filter is set
if (filterDocs == null && liveDocs != null && !liveDocs.get(docID)) {
return false;
}
// filter by filter context
if (filterDocs != null && !filterDocs.get(docID)) {
return false;
}
try {
collector.collect(docID, spare.toCharsRef(), decode(output.output1));
return true;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
};
// TODO: add fuzzy support
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, false, path.input);
}
try {
// hits are also returned by search()
// we do not use it, instead collect at acceptResult
Util.TopResults<Pair<Long, BytesRef>> search = searcher.search();
// search admissibility is not guaranteed
// see comment on getMaxTopNSearcherQueueSize
// assert search.isComplete;
} catch (CollectionTerminatedException e) {
// terminate
}
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
/**
* Simple heuristics to try to avoid over-pruning potential suggestions by the
* TopNSearcher. Since suggestion entries can be rejected if they belong
* to a deleted document, the length of the TopNSearcher queue has to
* be increased by some factor, to account for the filtered out suggestions.
* This heuristic will try to make the searcher admissible, but the search
* can still lead to over-pruning
* <p>
* If a <code>filter</code> is applied, the queue size is increased by
* half the number of live documents.
* <p>
* The maximum queue size is {@link #MAX_TOP_N_QUEUE_SIZE}
*/
private int getMaxTopNSearcherQueueSize(int num, LeafReader reader, boolean filterEnabled) {
double liveDocsRatio = calculateLiveDocRatio(reader.numDocs(), reader.maxDoc());
if (liveDocsRatio == -1) {
return -1;
}
long maxQueueSize = num * maxAnalyzedPathsPerOutput;
// liveDocRatio can be at most 1.0 (if no docs were deleted)
assert liveDocsRatio <= 1.0d;
maxQueueSize = (long) (maxQueueSize / liveDocsRatio);
if (filterEnabled) {
maxQueueSize = maxQueueSize + (reader.numDocs()/2);
}
return (int) Math.min(MAX_TOP_N_QUEUE_SIZE, maxQueueSize);
}
private static double calculateLiveDocRatio(int numDocs, int maxDocs) {
return (numDocs > 0) ? ((double) numDocs / maxDocs) : -1;
}
/**
* Loads a {@link NRTSuggester} from {@link org.apache.lucene.store.IndexInput}
*/
public static NRTSuggester load(IndexInput input) throws IOException {
final FST<Pair<Long, BytesRef>> fst = new FST<>(input, new PairOutputs<>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
/* read some meta info */
int maxAnalyzedPathsPerOutput = input.readVInt();
int endByte = input.readVInt();
int payloadSep = input.readVInt();
return new NRTSuggester(fst, maxAnalyzedPathsPerOutput, payloadSep, endByte);
}
static long encode(long input) {
if (input < 0) {
throw new UnsupportedOperationException("cannot encode value: " + input);
}
return Long.MAX_VALUE - input;
}
static long decode(long output) {
return (Long.MAX_VALUE - output);
}
/**
* Helper to encode/decode payload (surface + PAYLOAD_SEP + docID) output
*/
static final class PayLoadProcessor {
final static private int MAX_DOC_ID_LEN_WITH_SEP = 6; // vint takes at most 5 bytes
static int parseSurfaceForm(final BytesRef output, int payloadSep, CharsRefBuilder spare) {
int surfaceFormLen = -1;
for (int i = 0; i < output.length; i++) {
if (output.bytes[output.offset + i] == payloadSep) {
surfaceFormLen = i;
break;
}
}
assert surfaceFormLen != -1 : "no payloadSep found, unable to determine surface form";
spare.copyUTF8Bytes(output.bytes, output.offset, surfaceFormLen);
return surfaceFormLen;
}
static int parseDocID(final BytesRef output, int payloadSepIndex) {
assert payloadSepIndex != -1 : "payload sep index can not be -1";
ByteArrayDataInput input = new ByteArrayDataInput(output.bytes, payloadSepIndex + output.offset + 1, output.length - (payloadSepIndex + output.offset));
return input.readVInt();
}
static BytesRef make(final BytesRef surface, int docID, int payloadSep) throws IOException {
int len = surface.length + MAX_DOC_ID_LEN_WITH_SEP;
byte[] buffer = new byte[len];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
output.writeBytes(surface.bytes, surface.length - surface.offset);
output.writeByte((byte) payloadSep);
output.writeVInt(docID);
return new BytesRef(buffer, 0, output.getPosition());
}
}
}
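
Two small worked notes on the pieces above (illustrative, not part of this change set; same-package access is assumed since NRTSuggester is package-private). The weight encoding inverts values so the TopNSearcher, which explores the smallest outputs first, surfaces the highest-weighted completions first. For the queue sizing heuristic: with num = 10, maxAnalyzedPathsPerOutput = 2 and 10% deleted documents, the queue grows from 20 to roughly 22, an active filter adds half the live documents, and everything is capped at 1000.

package org.apache.lucene.search.suggest.document;

class WeightEncodingSketch {
  public static void main(String[] args) {
    long high = 100L, low = 5L;
    long encodedHigh = NRTSuggester.encode(high);   // Long.MAX_VALUE - 100
    long encodedLow = NRTSuggester.encode(low);     // Long.MAX_VALUE - 5
    // the larger original weight has the smaller encoded output, so the
    // min-first FST traversal visits it first
    assert encodedHigh < encodedLow;
    assert NRTSuggester.decode(encodedHigh) == high;
  }
}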


@@ -0,0 +1,165 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.PriorityQueue;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import static org.apache.lucene.search.suggest.document.NRTSuggester.encode;
/**
* Builder for {@link NRTSuggester}
*
*/
final class NRTSuggesterBuilder {
/**
* Label used to separate surface form and docID
* in the output
*/
public static final int PAYLOAD_SEP = '\u001F';
/**
* Marks end of the analyzed input and start of dedup
* byte.
*/
private static final int END_BYTE = 0x0;
private final PairOutputs<Long, BytesRef> outputs;
private final Builder<PairOutputs.Pair<Long, BytesRef>> builder;
private final IntsRefBuilder scratchInts = new IntsRefBuilder();
private final BytesRefBuilder analyzed = new BytesRefBuilder();
private final PriorityQueue<Entry> entries;
private final int payloadSep;
private final int endByte;
private int maxAnalyzedPathsPerOutput = 0;
/**
* Create a builder for {@link NRTSuggester}
*/
public NRTSuggesterBuilder() {
this.payloadSep = PAYLOAD_SEP;
this.endByte = END_BYTE;
this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.entries = new PriorityQueue<>();
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
}
/**
* Initializes an FST input term to add entries against
*/
public void startTerm(BytesRef analyzed) {
this.analyzed.copyBytes(analyzed);
this.analyzed.append((byte) endByte);
}
/**
* Adds an entry for the latest input term, should be called after
* {@link #startTerm(org.apache.lucene.util.BytesRef)} on the desired input
*/
public void addEntry(int docID, BytesRef surfaceForm, long weight) throws IOException {
BytesRef payloadRef = NRTSuggester.PayLoadProcessor.make(surfaceForm, docID, payloadSep);
entries.add(new Entry(payloadRef, encode(weight)));
}
/**
* Writes all the entries for the FST input term
*/
public void finishTerm() throws IOException {
int numArcs = 0;
int numDedupBytes = 1;
analyzed.grow(analyzed.length() + 1);
analyzed.setLength(analyzed.length() + 1);
for (Entry entry : entries) {
if (numArcs == maxNumArcsForDedupByte(numDedupBytes)) {
analyzed.setByteAt(analyzed.length() - 1, (byte) (numArcs));
analyzed.grow(analyzed.length() + 1);
analyzed.setLength(analyzed.length() + 1);
numArcs = 0;
numDedupBytes++;
}
analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
Util.toIntsRef(analyzed.get(), scratchInts);
builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
}
maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
entries.clear();
}
/**
* Builds and stores a FST that can be loaded with
* {@link NRTSuggester#load(org.apache.lucene.store.IndexInput)}
*/
public boolean store(DataOutput output) throws IOException {
final FST<PairOutputs.Pair<Long, BytesRef>> build = builder.finish();
if (build == null) {
return false;
}
build.save(output);
/* write some more meta-info */
assert maxAnalyzedPathsPerOutput > 0;
output.writeVInt(maxAnalyzedPathsPerOutput);
output.writeVInt(END_BYTE);
output.writeVInt(PAYLOAD_SEP);
return true;
}
/**
* Num arcs for nth dedup byte:
* if n <= 5: 1 + (2 * n)
* else: (1 + (2 * n)) * n
* <p>
* TODO: is there a better way to make the built fst
* more TopNSearcher friendly?
*/
private static int maxNumArcsForDedupByte(int currentNumDedupBytes) {
int maxArcs = 1 + (2 * currentNumDedupBytes);
if (currentNumDedupBytes > 5) {
maxArcs *= currentNumDedupBytes;
}
return Math.min(maxArcs, 255);
}
private final static class Entry implements Comparable<Entry> {
final BytesRef payload;
final long weight;
public Entry(BytesRef payload, long weight) {
this.payload = payload;
this.weight = weight;
}
@Override
public int compareTo(Entry o) {
return Long.compare(weight, o.weight);
}
}
}
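
To make the dedup scheme in finishTerm() concrete (an illustrative sketch mirroring maxNumArcsForDedupByte(), not commit code): entries sharing an analyzed form are told apart by appending dedup bytes, and the n-th dedup byte is allowed the following fan-out before a further byte is appended.

class DedupByteFanOutSketch {
  public static void main(String[] args) {
    // mirrors maxNumArcsForDedupByte: 3, 5, 7, 9, 11 entries for dedup bytes 1..5,
    // then (1 + 2n) * n for deeper bytes, capped at 255
    for (int n = 1; n <= 8; n++) {
      int maxArcs = 1 + (2 * n);
      if (n > 5) {
        maxArcs *= n;
      }
      System.out.println("dedup byte " + n + " -> up to " + Math.min(maxArcs, 255) + " entries");
    }
  }
}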


@@ -0,0 +1,123 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.BytesRef;
/**
* <p>
* Field that indexes a string value and a weight as a weighted completion
* against a named suggester.
* The field is tokenized, not stored, and indexes documents, frequencies and positions.
* Field can be used to provide near real time document suggestions.
* </p>
* <p>
* Besides the usual {@link org.apache.lucene.analysis.Analyzer}s,
* {@link CompletionAnalyzer}
* can be used to tune suggest field only parameters
* (e.g. preserving token separators, preserving position increments
* when converting the token stream to an automaton)
* </p>
* <p>
* Example indexing usage:
* <pre class="prettyprint">
* document.add(new SuggestField(name, "suggestion", 4));
* </pre>
* To perform document suggestions based on this field, use
* {@link SuggestIndexSearcher#suggest(String, CharSequence, int, org.apache.lucene.search.Filter)}
* <p>
* Example query usage:
* <pre class="prettyprint">
* SuggestIndexSearcher indexSearcher = ..
* indexSearcher.suggest(name, "su", 2)
* </pre>
*
* @lucene.experimental
*/
public class SuggestField extends Field {
private static final FieldType FIELD_TYPE = new FieldType();
static {
FIELD_TYPE.setTokenized(true);
FIELD_TYPE.setStored(false);
FIELD_TYPE.setStoreTermVectors(false);
FIELD_TYPE.setOmitNorms(false);
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
FIELD_TYPE.freeze();
}
private final BytesRef surfaceForm;
private final long weight;
/**
* Creates a {@link SuggestField}
*
* @param name of the field
* @param value to get suggestions on
* @param weight weight of the suggestion
*/
public SuggestField(String name, String value, long weight) {
super(name, value, FIELD_TYPE);
if (weight < 0L) {
throw new IllegalArgumentException("weight must be >= 0");
}
this.surfaceForm = new BytesRef(value);
this.weight = weight;
}
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
TokenStream stream = super.tokenStream(analyzer, reuse);
CompletionTokenStream completionStream;
if (stream instanceof CompletionTokenStream) {
completionStream = (CompletionTokenStream) stream;
} else {
completionStream = new CompletionTokenStream(stream);
}
BytesRef suggestPayload = buildSuggestPayload(surfaceForm, weight, (char) completionStream.sepLabel());
completionStream.setPayload(suggestPayload);
return completionStream;
}
private BytesRef buildSuggestPayload(BytesRef surfaceForm, long weight, char sepLabel) throws IOException {
for (int i = 0; i < surfaceForm.length; i++) {
if (surfaceForm.bytes[i] == sepLabel) {
assert sepLabel == '\u001f';
throw new IllegalArgumentException(
"surface form cannot contain unit separator character U+001F; this character is reserved");
}
}
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
output.writeVInt(surfaceForm.length);
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
output.writeVLong(weight + 1);
}
return new BytesRef(byteArrayOutputStream.toByteArray());
}
}
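For illustration only, a hedged sketch of reading back the payload written by buildSuggestPayload above. The real decoding happens inside NRTSuggester; the class and method names below are made up for the example.

import java.io.IOException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;

final class SuggestPayloadExample {
  // Mirrors buildSuggestPayload(): VInt length, surface form bytes, VLong(weight + 1)
  static void decode(BytesRef payload) throws IOException {
    ByteArrayDataInput in = new ByteArrayDataInput(payload.bytes, payload.offset, payload.length);
    int len = in.readVInt();                 // surface form length
    byte[] surface = new byte[len];
    in.readBytes(surface, 0, len);           // surface form bytes
    long weight = in.readVLong() - 1;        // weight was encoded as weight + 1
    System.out.println(new BytesRef(surface).utf8ToString() + " => " + weight);
  }
}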

View File

@ -0,0 +1,150 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.automaton.Automaton;
import static org.apache.lucene.search.suggest.document.CompletionFieldsProducer.CompletionTerms;
/**
* Adds document suggest capabilities to IndexSearcher
*
* @lucene.experimental
*/
public class SuggestIndexSearcher extends IndexSearcher {
private final Analyzer queryAnalyzer;
/**
* Creates a searcher with document suggest capabilities
* for <code>reader</code>.
* <p>
* Suggestion <code>key</code> is analyzed with <code>queryAnalyzer</code>
*/
public SuggestIndexSearcher(IndexReader reader, Analyzer queryAnalyzer) {
super(reader);
this.queryAnalyzer = queryAnalyzer;
}
/**
* Calls {@link #suggest(String, CharSequence, int, Filter)}
* with no document filter
*/
public TopSuggestDocs suggest(String field, CharSequence key, int num) throws IOException {
return suggest(field, key, num, (Filter) null);
}
/**
* Calls {@link #suggest(String, CharSequence, int, Filter, TopSuggestDocsCollector)}
* with no document filter
*/
public void suggest(String field, CharSequence key, int num, TopSuggestDocsCollector collector) throws IOException {
suggest(field, key, num, null, collector);
}
/**
* Suggests at most <code>num</code> documents, filtered by <code>filter</code>,
* that complete to <code>key</code> for a suggest <code>field</code>
* <p>
* Returns at most the top <code>num</code> document ids with their corresponding completion and weight pairs
*
* @throws java.lang.IllegalArgumentException if <code>filter</code> does not provide a random access
* interface or if <code>field</code> is not a {@link SuggestField}
*/
public TopSuggestDocs suggest(String field, CharSequence key, int num, Filter filter) throws IOException {
TopSuggestDocsCollector collector = new TopSuggestDocsCollector(num);
suggest(field, key, num, filter, collector);
return collector.get();
}
/**
* Suggests at most <code>num</code> documents, filtered by <code>filter</code>,
* that complete to <code>key</code> for a suggest <code>field</code>
* <p>
* Completions are collected with a {@link TopSuggestDocsCollector}
* in order of the suggest <code>field</code> weight.
* The same document can be collected more than once if the <code>key</code>
* matches multiple <code>field</code> values of that document
*
* @throws java.lang.IllegalArgumentException if <code>filter</code> does not provide a random access
* interface or if <code>field</code> is not a {@link SuggestField}
*/
public void suggest(String field, CharSequence key, int num, Filter filter, TopSuggestDocsCollector collector) throws IOException {
// verify input
if (field == null) {
throw new IllegalArgumentException("'field' can not be null");
}
if (num <= 0) {
throw new IllegalArgumentException("'num' should be > 0");
}
if (collector == null) {
throw new IllegalArgumentException("'collector' can not be null");
}
// build query automaton
CompletionAnalyzer analyzer;
if (queryAnalyzer instanceof CompletionAnalyzer) {
analyzer = (CompletionAnalyzer) queryAnalyzer;
} else {
analyzer = new CompletionAnalyzer(queryAnalyzer);
}
final Automaton automaton = analyzer.toAutomaton(field, key);
// collect results
for (LeafReaderContext context : getIndexReader().leaves()) {
TopSuggestDocsCollector leafCollector = (TopSuggestDocsCollector) collector.getLeafCollector(context);
LeafReader reader = context.reader();
Terms terms = reader.terms(field);
if (terms == null) {
continue;
}
NRTSuggester suggester;
if (terms instanceof CompletionTerms) {
CompletionTerms completionTerms = (CompletionTerms) terms;
suggester = completionTerms.suggester();
} else {
throw new IllegalArgumentException(field + " is not a SuggestField");
}
if (suggester == null) {
// a segment can have a null suggester
// i.e. no FST was built
continue;
}
DocIdSet docIdSet = null;
if (filter != null) {
docIdSet = filter.getDocIdSet(context, reader.getLiveDocs());
if (docIdSet == null) {
// filter matches no docs in the current leaf
continue;
}
}
suggester.lookup(reader, automaton, num, docIdSet, leafCollector);
}
}
}
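For orientation, a hedged, self-contained sketch of the end-to-end flow: routing a suggest field to Completion50PostingsFormat, indexing a SuggestField, and running a suggestion. The directory, analyzer, field name and suggestion values are illustrative; the codec wiring mirrors iwcWithSuggestField() in SuggestFieldTest later in this commit.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.suggest.document.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class SuggestFieldExample {
  public static void main(String[] args) throws Exception {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    // route the suggest field to the completion postings format, everything else to the default
    iwc.setCodec(new Lucene50Codec() {
      final PostingsFormat suggestFormat = new Completion50PostingsFormat();
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return "suggest_field".equals(field) ? suggestFormat : super.getPostingsFormatForField(field);
      }
    });
    try (Directory dir = new RAMDirectory();
         IndexWriter writer = new IndexWriter(dir, iwc)) {
      Document doc = new Document();
      doc.add(new SuggestField("suggest_field", "star wars", 10));
      writer.addDocument(doc);
      try (DirectoryReader reader = DirectoryReader.open(writer, true)) {
        SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader, analyzer);
        TopSuggestDocs top = searcher.suggest("suggest_field", "sta", 5);
        for (TopSuggestDocs.SuggestScoreDoc hit : top.scoreLookupDocs()) {
          System.out.println(hit.key + " doc=" + hit.doc + " score=" + hit.score);
        }
      }
    }
  }
}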

View File

@ -0,0 +1,56 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;
import org.apache.lucene.util.PriorityQueue;
/**
* Bounded priority queue for {@link SuggestScoreDoc}s.
* Priority is based on {@link SuggestScoreDoc#score}; ties
* are broken by {@link SuggestScoreDoc#doc}
*/
final class SuggestScoreDocPriorityQueue extends PriorityQueue<SuggestScoreDoc> {
/**
* Creates a new priority queue of the specified size.
*/
public SuggestScoreDocPriorityQueue(int size) {
super(size);
}
@Override
protected boolean lessThan(SuggestScoreDoc a, SuggestScoreDoc b) {
if (a.score == b.score) {
// prefer smaller doc id, in case of a tie
return a.doc > b.doc;
}
return a.score < b.score;
}
/**
* Returns the top N results in descending order.
*/
public SuggestScoreDoc[] getResults() {
int size = size();
SuggestScoreDoc[] res = new SuggestScoreDoc[size];
for (int i = size - 1; i >= 0; i--) {
res[i] = pop();
}
return res;
}
}

View File

@ -0,0 +1,111 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.suggest.Lookup;
/**
* {@link org.apache.lucene.search.TopDocs} wrapper with
* an additional CharSequence key per {@link org.apache.lucene.search.ScoreDoc}
*
* @lucene.experimental
*/
public class TopSuggestDocs extends TopDocs {
/**
* Singleton for empty {@link TopSuggestDocs}
*/
public final static TopSuggestDocs EMPTY = new TopSuggestDocs(0, new SuggestScoreDoc[0], 0);
/**
* {@link org.apache.lucene.search.ScoreDoc} with an
* additional CharSequence key
*/
public static class SuggestScoreDoc extends ScoreDoc implements Comparable<SuggestScoreDoc> {
/**
* Matched completion key
*/
public CharSequence key;
/**
* Creates a SuggestScoreDoc instance
*
* @param doc document id (hit)
* @param key matched completion
* @param score weight of the matched completion
*/
public SuggestScoreDoc(int doc, CharSequence key, long score) {
// loss of precision but not magnitude
// implicit conversion from long -> float
super(doc, score);
this.key = key;
}
@Override
public int compareTo(SuggestScoreDoc o) {
return Lookup.CHARSEQUENCE_COMPARATOR.compare(key, o.key);
}
}
/**
* {@link org.apache.lucene.search.TopDocs} wrapper with
* {@link TopSuggestDocs.SuggestScoreDoc}
* instead of {@link org.apache.lucene.search.ScoreDoc}
*/
public TopSuggestDocs(int totalHits, SuggestScoreDoc[] scoreDocs, float maxScore) {
super(totalHits, scoreDocs, maxScore);
}
/**
* Returns {@link TopSuggestDocs.SuggestScoreDoc}s
* for this instance
*/
public SuggestScoreDoc[] scoreLookupDocs() {
return (SuggestScoreDoc[]) scoreDocs;
}
/**
* Returns a new TopSuggestDocs, containing topN results across
* the provided TopSuggestDocs, sorting by score. Each {@link TopSuggestDocs}
* instance must be sorted.
* Analogous to {@link org.apache.lucene.search.TopDocs#merge(int, org.apache.lucene.search.TopDocs[])}
* for {@link TopSuggestDocs}
*
* NOTE: assumes every <code>shardHit</code> is already sorted by score
*/
public static TopSuggestDocs merge(int topN, TopSuggestDocs[] shardHits) {
SuggestScoreDocPriorityQueue priorityQueue = new SuggestScoreDocPriorityQueue(topN);
for (TopSuggestDocs shardHit : shardHits) {
for (SuggestScoreDoc scoreDoc : shardHit.scoreLookupDocs()) {
if (scoreDoc == priorityQueue.insertWithOverflow(scoreDoc)) {
break;
}
}
}
SuggestScoreDoc[] topNResults = priorityQueue.getResults();
if (topNResults.length > 0) {
return new TopSuggestDocs(topNResults.length, topNResults, topNResults[0].score);
} else {
return TopSuggestDocs.EMPTY;
}
}
}
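A hedged sketch of how merge(int, TopSuggestDocs[]) might be used across shards; the helper class and its searcher array are illustrative, not part of this commit. Each per-shard TopSuggestDocs returned by suggest() is already sorted by score, as merge() requires.

import java.io.IOException;

final class SuggestMergeExample {
  // Illustrative helper: runs the same suggestion on each shard searcher and merges the results.
  static TopSuggestDocs mergedSuggestions(SuggestIndexSearcher[] shardSearchers,
                                          String field, CharSequence key, int topN) throws IOException {
    TopSuggestDocs[] shardHits = new TopSuggestDocs[shardSearchers.length];
    for (int i = 0; i < shardSearchers.length; i++) {
      shardHits[i] = shardSearchers[i].suggest(field, key, topN);
    }
    return TopSuggestDocs.merge(topN, shardHits);
  }
}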

View File

@ -0,0 +1,118 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.SimpleCollector;
import static org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;
/**
* {@link org.apache.lucene.search.Collector} that collects completion and
* score, along with document id
* <p>
* Non-scoring collector that collects completions in order of their
* pre-defined weight.
* <p>
* NOTE: One document can be collected multiple times if a document
* is matched for multiple unique completions for a given query
* <p>
* Subclasses should only override {@link TopSuggestDocsCollector#collect(int, CharSequence, long)};
* {@link #setScorer(org.apache.lucene.search.Scorer)} is not
* used
*
* @lucene.experimental
*/
public class TopSuggestDocsCollector extends SimpleCollector {
private final SuggestScoreDocPriorityQueue priorityQueue;
/**
* Document base offset for the current Leaf
*/
protected int docBase;
/**
* Sole constructor
*
* Collects at most <code>num</code> completions
* with corresponding document and weight
*/
public TopSuggestDocsCollector(int num) {
if (num <= 0) {
throw new IllegalArgumentException("'num' must be > 0");
}
this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
}
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
docBase = context.docBase;
}
/**
* Called for every matched completion,
* similar to {@link org.apache.lucene.search.LeafCollector#collect(int)}
* but for completions.
*
* NOTE: collection at the leaf level is guaranteed to be in
* descending order of score
*/
public void collect(int docID, CharSequence key, long score) throws IOException {
SuggestScoreDoc current = new SuggestScoreDoc(docBase + docID, key, score);
if (current == priorityQueue.insertWithOverflow(current)) {
// if the current SuggestScoreDoc has overflown from pq,
// we can assume all of the successive collections from
// this leaf will be overflown as well
// TODO: reuse the overflow instance?
throw new CollectionTerminatedException();
}
}
/**
* Returns at most <code>num</code> top scoring entries as a {@link org.apache.lucene.search.suggest.document.TopSuggestDocs}
*/
public TopSuggestDocs get() throws IOException {
SuggestScoreDoc[] suggestScoreDocs = priorityQueue.getResults();
if (suggestScoreDocs.length > 0) {
return new TopSuggestDocs(suggestScoreDocs.length, suggestScoreDocs, suggestScoreDocs[0].score);
} else {
return TopSuggestDocs.EMPTY;
}
}
/**
* Ignored
*/
@Override
public void collect(int doc) throws IOException {
// {@link #collect(int, CharSequence, long)} is used
// instead
}
/**
* Ignored
*/
@Override
public boolean needsScores() {
return false;
}
}
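As noted in the javadoc, subclasses only need to override collect(int, CharSequence, long). A hedged sketch of a collector that logs every matched completion while keeping the default top-N behavior (the class name is illustrative):

import java.io.IOException;

class PrintingSuggestCollector extends TopSuggestDocsCollector {
  PrintingSuggestCollector(int num) {
    super(num);
  }

  @Override
  public void collect(int docID, CharSequence key, long score) throws IOException {
    // docID is segment-local here; the base class adds docBase before queueing
    System.out.println("leaf doc=" + docID + " key=" + key + " score=" + score);
    super.collect(docID, key, score); // may throw CollectionTerminatedException to early-terminate the leaf
  }
}

Such a collector would be passed to SuggestIndexSearcher#suggest(String, CharSequence, int, Filter, TopSuggestDocsCollector) in place of the default one.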

View File

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Support for document suggestion
*/
package org.apache.lucene.search.suggest.document;

View File

@ -0,0 +1,33 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
org.apache.lucene.search.suggest.document.Completion50PostingsFormat

View File

@ -0,0 +1,786 @@
package org.apache.lucene.search.suggest.document;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CyclicBarrier;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import static org.apache.lucene.search.suggest.document.TopSuggestDocs.*;
import static org.hamcrest.core.IsEqual.equalTo;
public class SuggestFieldTest extends LuceneTestCase {
public Directory dir;
@Before
public void before() throws Exception {
dir = newDirectory();
}
@After
public void after() throws Exception {
dir.close();
}
@Test
public void testSimple() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
Document document = new Document();
document.add(newSuggestField("suggest_field", "abc", 3l));
document.add(newSuggestField("suggest_field", "abd", 4l));
document.add(newSuggestField("suggest_field", "The Foo Fighters", 2l));
iw.addDocument(document);
document.clear();
document.add(newSuggestField("suggest_field", "abcdd", 5));
iw.addDocument(document);
if (rarely()) {
iw.commit();
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs lookupDocs = suggestIndexSearcher.suggest("suggest_field", "ab", 3);
assertSuggestions(lookupDocs, new Entry("abcdd", 5), new Entry("abd", 4), new Entry("abc", 3));
reader.close();
iw.close();
}
@Test
public void testMultipleSuggestFieldsPerDoc() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "sug_field_1", "sug_field_2"));
Document document = new Document();
document.add(newSuggestField("sug_field_1", "apple", 4));
document.add(newSuggestField("sug_field_2", "april", 3));
iw.addDocument(document);
document.clear();
document.add(newSuggestField("sug_field_1", "aples", 3));
document.add(newSuggestField("sug_field_2", "apartment", 2));
iw.addDocument(document);
if (rarely()) {
iw.commit();
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggestDocs1 = suggestIndexSearcher.suggest("sug_field_1", "ap", 4);
assertSuggestions(suggestDocs1, new Entry("apple", 4), new Entry("aples", 3));
TopSuggestDocs suggestDocs2 = suggestIndexSearcher.suggest("sug_field_2", "ap", 4);
assertSuggestions(suggestDocs2, new Entry("april", 3), new Entry("apartment", 2));
// check that the doc ids are consistent
for (int i = 0; i < suggestDocs1.scoreDocs.length; i++) {
ScoreDoc suggestScoreDoc = suggestDocs1.scoreDocs[i];
assertThat(suggestScoreDoc.doc, equalTo(suggestDocs2.scoreDocs[i].doc));
}
reader.close();
iw.close();
}
@Test
public void testDupSuggestFieldValues() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(300);
long[] weights = new long[num];
for(int i = 0; i < num; i++) {
Document document = new Document();
weights[i] = Math.abs(random().nextLong());
document.add(newSuggestField("suggest_field", "abc", weights[i]));
iw.addDocument(document);
}
if (rarely()) {
iw.commit();
}
DirectoryReader reader = iw.getReader();
Entry[] expectedEntries = new Entry[num];
Arrays.sort(weights);
for (int i = 1; i <= num; i++) {
expectedEntries[i - 1] = new Entry("abc", weights[num - i]);
}
SuggestIndexSearcher suggestIndexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs lookupDocs = suggestIndexSearcher.suggest("suggest_field", "abc", num);
assertSuggestions(lookupDocs, expectedEntries);
reader.close();
iw.close();
}
@Test
public void testNRTDeletedDocFiltering() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
// using IndexWriter instead of RandomIndexWriter
IndexWriter iw = new IndexWriter(dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
int numLive = 0;
List<Entry> expectedEntries = new ArrayList<>();
for (int i = 0; i < num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, num - i));
if (i % 2 == 0) {
document.add(newStringField("str_field", "delete", Field.Store.YES));
} else {
numLive++;
expectedEntries.add(new Entry("abc_" + i, num - i));
document.add(newStringField("str_field", "no_delete", Field.Store.YES));
}
iw.addDocument(document);
document.clear();
}
// get docIDs to delete
DirectoryReader reader = DirectoryReader.open(iw, false);
List<Integer> docIdsToDelete = new ArrayList<>();
for (int i = 0; i < reader.maxDoc(); i++) {
StoredDocument doc = reader.document(i);
if ("delete".equals(doc.get("str_field"))) {
docIdsToDelete.add(i);
}
}
for (Integer docID : docIdsToDelete) {
assertTrue(iw.tryDeleteDocument(reader, docID));
}
reader.close();
reader = DirectoryReader.open(iw, false);
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", numLive);
assertSuggestions(suggest, expectedEntries.toArray(new Entry[expectedEntries.size()]));
reader.close();
iw.close();
}
@Test
public void testSuggestOnAllFilteredDocuments() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
for (int i = 0; i < num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
document.add(newStringField("str_fld", "deleted", Field.Store.NO));
iw.addDocument(document);
document.clear();
}
Filter filter = new QueryWrapperFilter(new TermsQuery("str_fld", new BytesRef("non_existent")));
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
// no random access required;
// calling suggest with filter that does not match any documents should early terminate
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", num, filter);
assertThat(suggest.totalHits, equalTo(0));
reader.close();
iw.close();
}
@Test
public void testSuggestOnAllDeletedDocuments() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
// using IndexWriter instead of RandomIndexWriter
IndexWriter iw = new IndexWriter(dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
for (int i = 0; i < num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
iw.addDocument(document);
document.clear();
}
DirectoryReader reader = DirectoryReader.open(iw, false);
for (int docID = 0; docID < reader.maxDoc(); docID++) {
assertTrue(iw.tryDeleteDocument(reader, docID));
}
reader.close();
reader = DirectoryReader.open(iw, false);
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", num);
assertThat(suggest.totalHits, equalTo(0));
reader.close();
iw.close();
}
@Test
public void testSuggestOnMostlyDeletedDocuments() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
// using IndexWriter instead of RandomIndexWriter
IndexWriter iw = new IndexWriter(dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
for (int i = 1; i <= num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
document.add(new IntField("weight_fld", i, Field.Store.YES));
iw.addDocument(document);
document.clear();
}
DirectoryReader reader = DirectoryReader.open(iw, false);
// delete all but the lowest scored suggestion
for (int docID = 0; docID < reader.maxDoc(); docID++) {
StoredDocument doc = reader.document(docID);
StorableField[] weights = doc.getFields("weight_fld");
assertThat(weights.length, equalTo(1));
int weight = weights[0].numericValue().intValue();
if (weight != 1) {
assertTrue(iw.tryDeleteDocument(reader, docID));
}
}
reader.close();
reader = DirectoryReader.open(iw, false);
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", 1);
assertSuggestions(suggest, new Entry("abc_1", 1));
reader.close();
iw.close();
}
@Test
public void testSuggestOnMostlyFilteredOutDocuments() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
for (int i = 0; i < num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
document.add(new IntField("filter_int_fld", i, Field.Store.NO));
iw.addDocument(document);
document.clear();
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
int topScore = num/2;
QueryWrapperFilter filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", 0, topScore, true, true));
Filter filter = randomAccessFilter(filterWrapper);
// if at most half of the top scoring documents have been filtered out
// the search should be admissible
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", 1, filter);
assertSuggestions(suggest, new Entry("abc_" + topScore, topScore));
filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", 0, 0, true, true));
filter = randomAccessFilter(filterWrapper);
// if more than half of the top scoring documents have been filtered out
// search is not admissible, so # of suggestions requested is num instead of 1
suggest = indexSearcher.suggest("suggest_field", "abc_", num, filter);
assertSuggestions(suggest, new Entry("abc_0", 0));
filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", num - 1, num - 1, true, true));
filter = randomAccessFilter(filterWrapper);
// if only lower scoring documents are filtered out
// search is admissible
suggest = indexSearcher.suggest("suggest_field", "abc_", 1, filter);
assertSuggestions(suggest, new Entry("abc_" + (num - 1), num - 1));
reader.close();
iw.close();
}
@Test
public void testEarlyTermination() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
// have segments of 4 documents
// with descending suggestion weights
// suggest should early terminate for
// segments with docs having lower suggestion weights
for (int i = num; i > 0; i--) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
iw.addDocument(document);
document.clear();
if (i % 4 == 0) {
iw.commit();
}
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", 1);
assertSuggestions(suggest, new Entry("abc_" + num, num));
reader.close();
iw.close();
}
@Test
public void testMultipleSegments() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(10);
Document document = new Document();
List<Entry> entries = new ArrayList<>();
// ensure at least some segments have no suggest field
for (int i = num; i > 0; i--) {
if (random().nextInt(4) == 1) {
document.add(newSuggestField("suggest_field", "abc_" + i, i));
entries.add(new Entry("abc_" + i, i));
}
document.add(new IntField("weight_fld", i, Field.Store.YES));
iw.addDocument(document);
document.clear();
if (usually()) {
iw.commit();
}
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", (entries.size() == 0) ? 1 : entries.size());
assertSuggestions(suggest, entries.toArray(new Entry[entries.size()]));
reader.close();
iw.close();
}
@Test
public void testDocFiltering() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
Document document = new Document();
document.add(new IntField("filter_int_fld", 9, Field.Store.NO));
document.add(newSuggestField("suggest_field", "apples", 3));
iw.addDocument(document);
document.clear();
document.add(new IntField("filter_int_fld", 10, Field.Store.NO));
document.add(newSuggestField("suggest_field", "applle", 4));
iw.addDocument(document);
document.clear();
document.add(new IntField("filter_int_fld", 4, Field.Store.NO));
document.add(newSuggestField("suggest_field", "apple", 5));
iw.addDocument(document);
if (rarely()) {
iw.commit();
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
// suggest without filter
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "app", 3);
assertSuggestions(suggest, new Entry("apple", 5), new Entry("applle", 4), new Entry("apples", 3));
// suggest with filter
QueryWrapperFilter filterWrapper = new QueryWrapperFilter(NumericRangeQuery.newIntRange("filter_int_fld", 5, 12, true, true));
Filter filter = randomAccessFilter(filterWrapper);
suggest = indexSearcher.suggest("suggest_field", "app", 3, filter);
assertSuggestions(suggest, new Entry("applle", 4), new Entry("apples", 3));
reader.close();
iw.close();
}
@Test
public void testReturnedDocID() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
Document document = new Document();
int num = atLeast(10);
for (int i = 0; i < num; i++) {
document.add(newSuggestField("suggest_field", "abc_" + i, num));
document.add(new IntField("int_field", i, Field.Store.YES));
iw.addDocument(document);
document.clear();
if (random().nextBoolean()) {
iw.commit();
}
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", "abc_", num);
assertEquals(num, suggest.totalHits);
for (SuggestScoreDoc suggestScoreDoc : suggest.scoreLookupDocs()) {
String key = suggestScoreDoc.key.toString();
assertTrue(key.startsWith("abc_"));
String substring = key.substring(4);
int fieldValue = Integer.parseInt(substring);
StoredDocument doc = reader.document(suggestScoreDoc.doc);
assertEquals(doc.getField("int_field").numericValue().intValue(), fieldValue);
}
reader.close();
iw.close();
}
@Test
public void testCompletionAnalyzerOptions() throws Exception {
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
Map<String, Analyzer> map = new HashMap<>();
map.put("suggest_field_default", new CompletionAnalyzer(analyzer));
CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer, false, true);
map.put("suggest_field_no_p_sep", completionAnalyzer);
completionAnalyzer = new CompletionAnalyzer(analyzer, true, false);
map.put("suggest_field_no_p_pos_inc", completionAnalyzer);
completionAnalyzer = new CompletionAnalyzer(analyzer, false, false);
map.put("suggest_field_no_p_sep_or_pos_inc", completionAnalyzer);
PerFieldAnalyzerWrapper analyzerWrapper = new PerFieldAnalyzerWrapper(analyzer, map);
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzerWrapper, map.keySet()));
Document document = new Document();
document.add(newSuggestField("suggest_field_default", "foobar", 7));
document.add(newSuggestField("suggest_field_default", "foo bar", 8));
document.add(newSuggestField("suggest_field_default", "the fo", 9));
document.add(newSuggestField("suggest_field_default", "the foo bar", 10));
document.add(newSuggestField("suggest_field_no_p_sep", "foobar", 7));
document.add(newSuggestField("suggest_field_no_p_sep", "foo bar", 8));
document.add(newSuggestField("suggest_field_no_p_sep", "the fo", 9));
document.add(newSuggestField("suggest_field_no_p_sep", "the foo bar", 10));
document.add(newSuggestField("suggest_field_no_p_pos_inc", "foobar", 7));
document.add(newSuggestField("suggest_field_no_p_pos_inc", "foo bar", 8));
document.add(newSuggestField("suggest_field_no_p_pos_inc", "the fo", 9));
document.add(newSuggestField("suggest_field_no_p_pos_inc", "the foo bar", 10));
document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "foobar", 7));
document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "foo bar", 8));
document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "the fo", 9));
document.add(newSuggestField("suggest_field_no_p_sep_or_pos_inc", "the foo bar", 10));
iw.addDocument(document);
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
TopSuggestDocs suggest;
suggest = indexSearcher.suggest("suggest_field_default", "fo", 4);
assertSuggestions(suggest, new Entry("foo bar", 8), new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_default", "foob", 4);
assertSuggestions(suggest, new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_sep", "fo", 4); // matches all 4
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_sep", "foob", 4); // except the fo
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_pos_inc", "fo", 4); //matches all 4
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_pos_inc", "foob", 4); // only foobar
assertSuggestions(suggest, new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_sep_or_pos_inc", "fo", 4); // all 4
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("the fo", 9), new Entry("foo bar", 8), new Entry("foobar", 7));
suggest = indexSearcher.suggest("suggest_field_no_p_sep_or_pos_inc", "foob", 4); // not the fo
assertSuggestions(suggest, new Entry("the foo bar", 10), new Entry("foo bar", 8), new Entry("foobar", 7));
reader.close();
iw.close();
}
@Test
public void testScoring() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
int num = atLeast(100);
String[] prefixes = {"abc", "bac", "cab"};
Map<String, Long> mappings = new HashMap<>();
for (int i = 0; i < num; i++) {
Document document = new Document();
String suggest = prefixes[i % 3] + TestUtil.randomSimpleString(random(), 10) + "_" + i;
long weight = Math.abs(random().nextLong());
document.add(newSuggestField("suggest_field", suggest, weight));
mappings.put(suggest, weight);
iw.addDocument(document);
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
for (String prefix : prefixes) {
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", prefix, num);
assertTrue(suggest.totalHits > 0);
float topScore = -1;
for (SuggestScoreDoc scoreDoc : suggest.scoreLookupDocs()) {
if (topScore != -1) {
assertTrue(topScore >= scoreDoc.score);
}
topScore = scoreDoc.score;
assertThat((float) mappings.get(scoreDoc.key.toString()), equalTo(scoreDoc.score));
assertNotNull(mappings.remove(scoreDoc.key.toString()));
}
}
assertThat(mappings.size(), equalTo(0));
reader.close();
iw.close();
}
@Test
public void testRealisticKeys() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field"));
LineFileDocs lineFileDocs = new LineFileDocs(random());
int num = atLeast(100);
Map<String, Long> mappings = new HashMap<>();
for (int i = 0; i < num; i++) {
Document document = lineFileDocs.nextDoc();
String title = document.getField("title").stringValue();
long weight = Math.abs(random().nextLong());
Long prevWeight = mappings.get(title);
if (prevWeight == null || prevWeight < weight) {
mappings.put(title, weight);
}
Document doc = new Document();
doc.add(newSuggestField("suggest_field", title, weight));
iw.addDocument(doc);
if (rarely()) {
iw.commit();
}
}
DirectoryReader reader = iw.getReader();
SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
for (Map.Entry<String, Long> entry : mappings.entrySet()) {
String title = entry.getKey();
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field", title, mappings.size());
assertTrue(suggest.totalHits > 0);
boolean matched = false;
for (ScoreDoc scoreDoc : suggest.scoreDocs) {
matched = Float.compare(scoreDoc.score, (float) entry.getValue()) == 0;
if (matched) {
break;
}
}
assertTrue("at least one of the entries should have the score", matched);
}
reader.close();
iw.close();
}
@Test
public void testThreads() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwcWithSuggestField(analyzer, "suggest_field_1", "suggest_field_2", "suggest_field_3"));
int num = atLeast(100);
final String prefix1 = "abc1_";
final String prefix2 = "abc2_";
final String prefix3 = "abc3_";
final Entry[] entries1 = new Entry[num];
final Entry[] entries2 = new Entry[num];
final Entry[] entries3 = new Entry[num];
for (int i = 0; i < num; i++) {
int weight = num - (i + 1);
entries1[i] = new Entry(prefix1 + weight, weight);
entries2[i] = new Entry(prefix2 + weight, weight);
entries3[i] = new Entry(prefix3 + weight, weight);
}
for (int i = 0; i < num; i++) {
Document doc = new Document();
doc.add(newSuggestField("suggest_field_1", prefix1 + i, i));
doc.add(newSuggestField("suggest_field_2", prefix2 + i, i));
doc.add(newSuggestField("suggest_field_3", prefix3 + i, i));
iw.addDocument(doc);
if (rarely()) {
iw.commit();
}
}
DirectoryReader reader = iw.getReader();
int numThreads = TestUtil.nextInt(random(), 2, 7);
Thread threads[] = new Thread[numThreads];
final CyclicBarrier startingGun = new CyclicBarrier(numThreads+1);
final CopyOnWriteArrayList<Throwable> errors = new CopyOnWriteArrayList<>();
final SuggestIndexSearcher indexSearcher = new SuggestIndexSearcher(reader, analyzer);
for (int i = 0; i < threads.length; i++) {
threads[i] = new Thread() {
@Override
public void run() {
try {
startingGun.await();
TopSuggestDocs suggest = indexSearcher.suggest("suggest_field_1", prefix1, num);
assertSuggestions(suggest, entries1);
suggest = indexSearcher.suggest("suggest_field_2", prefix2, num);
assertSuggestions(suggest, entries2);
suggest = indexSearcher.suggest("suggest_field_3", prefix3, num);
assertSuggestions(suggest, entries3);
} catch (Throwable e) {
errors.add(e);
}
}
};
threads[i].start();
}
startingGun.await();
for (Thread t : threads) {
t.join();
}
assertTrue(errors.toString(), errors.isEmpty());
reader.close();
iw.close();
}
private static Filter randomAccessFilter(Filter filter) {
return new Filter() {
@Override
public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException {
DocIdSet docIdSet = filter.getDocIdSet(context, acceptDocs);
DocIdSetIterator iterator = docIdSet.iterator();
FixedBitSet bits = new FixedBitSet(context.reader().maxDoc());
if (iterator != null) {
int doc;
while((doc = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
bits.set(doc);
}
}
return new DocIdSet() {
@Override
public DocIdSetIterator iterator() throws IOException {
// the original iterator has already been consumed above; return a fresh one over the collected bits
return new BitSetIterator(bits, bits.cardinality());
}
@Override
public Bits bits() throws IOException {
return bits;
}
@Override
public long ramBytesUsed() {
return docIdSet.ramBytesUsed();
}
};
}
@Override
public String toString(String field) {
return filter.toString(field);
}
};
}
private static class Entry {
private final String output;
private final float value;
private Entry(String output, float value) {
this.output = output;
this.value = value;
}
}
private void assertSuggestions(TopDocs actual, Entry... expected) {
SuggestScoreDoc[] suggestScoreDocs = (SuggestScoreDoc[]) actual.scoreDocs;
assertThat(suggestScoreDocs.length, equalTo(expected.length));
for (int i = 0; i < suggestScoreDocs.length; i++) {
SuggestScoreDoc lookupDoc = suggestScoreDocs[i];
assertThat(lookupDoc.key.toString(), equalTo(expected[i].output));
assertThat(lookupDoc.score, equalTo(expected[i].value));
}
}
private SuggestField newSuggestField(String name, String value, long weight) throws IOException {
return new SuggestField(name, value, weight);
}
private IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, String... suggestFields) {
return iwcWithSuggestField(analyzer, asSet(suggestFields));
}
private IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, Set<String> suggestFields) {
IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
iwc.setMergePolicy(newLogMergePolicy());
Codec filterCodec = new Lucene50Codec() {
PostingsFormat postingsFormat = new Completion50PostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
if (suggestFields.contains(field)) {
return postingsFormat;
}
return super.getPostingsFormatForField(field);
}
};
iwc.setCodec(filterCodec);
return iwc;
}
}