Updated Analyzing/FuzzySuggester from Lucene trunk
* Minor alignments (like setter to ctor)
* FuzzySuggester has a unicode aware flag, which is not exposed in the fuzzy completion request parameters
* Made the XAnalyzingSuggester flags (PAYLOAD_SEP, END_BYTE, SEP_LABEL) be written into the postings format, so we can retain backwards compatibility
* The above change also implies that these flags can be set per instantiated XAnalyzingSuggester
* CompletionPostingsFormatTest now uses a randomProvider for writing data to check for bwc
commit bf74f49fdd (parent 9f5d01ca4c)
@@ -218,6 +218,11 @@ The following parameters are supported:
     Minimum length of the input, which is not
     checked for fuzzy alternatives, defaults to `1`

+`unicode_aware`::
+    If set to `true`, all measurements (like edit distance,
+    transpositions and lengths) are in Unicode code points
+    (actual letters) instead of bytes.
+
 NOTE: If you want to stick with the default values, but
 still use fuzzy, you can either use `fuzzy: {}`
 or `fuzzy: true`.
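To see the new parameter in context, here is a hedged sketch of requesting unicode-aware fuzzy completion through the Java API touched by this commit; the index name "music", field name "suggest" and suggestion name "foo" are made-up, and `client` is assumed to be an org.elasticsearch.client.Client:

    // Sketch only: mirrors CompletionSuggestionFuzzyBuilder#setUnicodeAware added below.
    CompletionSuggestionFuzzyBuilder suggestion =
            new CompletionSuggestionFuzzyBuilder("foo").field("suggest").text("öööи").size(10)
                    .setUnicodeAware(true); // edit distance counted in code points, not bytes
    SuggestResponse response = client.prepareSuggest("music")
            .addSuggestion(suggestion)
            .execute().actionGet();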
@@ -26,6 +26,8 @@ import org.apache.lucene.search.suggest.InputIterator;
 import org.apache.lucene.search.suggest.Lookup;
 import org.apache.lucene.search.suggest.Sort;
 import org.apache.lucene.store.*;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.*;
 import org.apache.lucene.util.automaton.*;
 import org.apache.lucene.util.fst.*;
@@ -34,10 +36,7 @@ import org.apache.lucene.util.fst.PairOutputs.Pair;
 import org.apache.lucene.util.fst.Util.MinResult;
 import org.elasticsearch.common.collect.HppcMaps;

-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
+import java.io.*;
 import java.util.*;

 /**
@@ -53,8 +52,9 @@ import java.util.*;
  * then the partial text "ghost chr..." could see the
  * suggestion "The Ghost of Christmas Past". Note that
  * position increments MUST NOT be preserved for this example
- * to work, so you should call
- * {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}.
+ * to work, so you should call the constructor with the
+ * <code>preservePositionIncrements</code> parameter set to
+ * false.
  *
  * <p>
  * If SynonymFilter is used to map wifi and wireless network to
@@ -124,24 +124,24 @@ public class XAnalyzingSuggester extends Lookup {
   private final boolean preserveSep;

   /** Include this flag in the options parameter to {@link
-   * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)} to always
+   * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)} to always
    * return the exact match first, regardless of score. This
    * has no performance impact but could result in
    * low-quality suggestions. */
   public static final int EXACT_FIRST = 1;

   /** Include this flag in the options parameter to {@link
-   * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)} to preserve
+   * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)} to preserve
    * token separators when matching. */
   public static final int PRESERVE_SEP = 2;

   /** Represents the separation between tokens, if
    * PRESERVE_SEP was specified */
-  private static final int SEP_LABEL = 0xFF;
+  public static final int SEP_LABEL = '\u001F';

   /** Marks end of the analyzed input and start of dedup
    * byte. */
-  private static final int END_BYTE = 0x0;
+  public static final int END_BYTE = 0x0;

   /** Maximum number of dup surface forms (different surface
    * forms for the same analyzed form). */
@@ -160,27 +160,31 @@ public class XAnalyzingSuggester extends Lookup {

   private boolean hasPayloads;

-  private static final int PAYLOAD_SEP = '\u001f';
+  private final int sepLabel;
+  private final int payloadSep;
+  private final int endByte;
+
+  public static final int PAYLOAD_SEP = '\u001f';

   /** Whether position holes should appear in the automaton. */
   private boolean preservePositionIncrements;

   /**
-   * Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)
+   * Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)
    * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
    * PRESERVE_SEP, 256, -1)}
    */
   public XAnalyzingSuggester(Analyzer analyzer) {
-    this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, null, false, 0);
+    this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
   }

   /**
-   * Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)
+   * Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)
    * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |
    * PRESERVE_SEP, 256, -1)}
    */
   public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
-    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, null, false, 0);
+    this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
   }

   /**
@@ -199,8 +203,9 @@ public class XAnalyzingSuggester extends Lookup {
    * to expand from the analyzed form. Set this to -1 for
    * no limit.
    */
-  public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions
-                             , FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput) {
+  public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
+                             boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
+                             int sepLabel, int payloadSep, int endByte) {
     // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
     this.indexAnalyzer = indexAnalyzer;
     this.queryAnalyzer = queryAnalyzer;
@@ -226,16 +231,13 @@ public class XAnalyzingSuggester extends Lookup {
     }
     this.maxGraphExpansions = maxGraphExpansions;
     this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
-    this.preservePositionIncrements = true;
-  }
-
-  /** Whether to take position holes (position increment > 1) into account when
-   *  building the automaton, <code>true</code> by default. */
-  public void setPreservePositionIncrements(boolean preservePositionIncrements) {
-    this.preservePositionIncrements = preservePositionIncrements;
+    this.sepLabel = sepLabel;
+    this.payloadSep = payloadSep;
+    this.endByte = endByte;
   }

   /** Returns byte size of the underlying FST. */
   public long sizeInBytes() {
     return fst == null ? 0 : fst.sizeInBytes();
   }
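Because preservePositionIncrements and the separator labels moved from setters into the constructor, callers now pass everything up front. A hedged sketch of the new 12-argument constructor; the analyzer and numeric values are illustrative, taken from the defaults used elsewhere in this patch:

    // Sketch: Version.LUCENE_CURRENT is an illustrative version constant.
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    XAnalyzingSuggester suggester = new XAnalyzingSuggester(
            analyzer, analyzer,
            XAnalyzingSuggester.EXACT_FIRST | XAnalyzingSuggester.PRESERVE_SEP, // options
            256,                            // maxSurfaceFormsPerAnalyzedForm
            -1,                             // maxGraphExpansions, -1 means unlimited
            true,                           // preservePositionIncrements, formerly a setter
            null, false, 0,                 // fst, hasPayloads, maxAnalyzedPathsForOneInput
            XAnalyzingSuggester.SEP_LABEL,
            XAnalyzingSuggester.PAYLOAD_SEP,
            XAnalyzingSuggester.END_BYTE);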
@@ -251,7 +253,7 @@ public class XAnalyzingSuggester extends Lookup {

   // Replaces SEP with epsilon or remaps them if
   // we were asked to preserve them:
-  private static void replaceSep(Automaton a, boolean preserveSep) {
+  private static void replaceSep(Automaton a, boolean preserveSep, int replaceSep) {

     State[] states = a.getNumberedStates();

@@ -265,7 +267,7 @@ public class XAnalyzingSuggester extends Lookup {
         if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
           if (preserveSep) {
             // Remap to SEP_LABEL:
-            newTransitions.add(new Transition(SEP_LABEL, t.getDest()));
+            newTransitions.add(new Transition(replaceSep, t.getDest()));
           } else {
             copyDestTransitions(state, t.getDest(), newTransitions);
             a.setDeterministic(false);
@@ -289,21 +291,30 @@ public class XAnalyzingSuggester extends Lookup {
     }
   }

+  protected Automaton convertAutomaton(Automaton a) {
+    return a;
+  }
+
   /** Just escapes the 0xff byte (which we still use for SEP). */
   private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {

     final BytesRef spare = new BytesRef();
+    private char sepLabel;
+
+    public EscapingTokenStreamToAutomaton(char sepLabel) {
+      this.sepLabel = sepLabel;
+    }

     @Override
     protected BytesRef changeToken(BytesRef in) {
       int upto = 0;
       for(int i=0;i<in.length;i++) {
         byte b = in.bytes[in.offset+i];
-        if (b == (byte) SEP_LABEL) {
+        if (b == (byte) sepLabel) {
           if (spare.bytes.length == upto) {
             spare.grow(upto+2);
           }
-          spare.bytes[upto++] = (byte) SEP_LABEL;
+          spare.bytes[upto++] = (byte) sepLabel;
           spare.bytes[upto++] = b;
         } else {
           if (spare.bytes.length == upto) {
@@ -321,7 +332,7 @@ public class XAnalyzingSuggester extends Lookup {
   public TokenStreamToAutomaton getTokenStreamToAutomaton() {
     final TokenStreamToAutomaton tsta;
     if (preserveSep) {
-      tsta = new EscapingTokenStreamToAutomaton();
+      tsta = new EscapingTokenStreamToAutomaton((char) sepLabel);
     } else {
       // When we're not preserving sep, we don't steal 0xff
       // byte, so we don't need to do any escaping:
@@ -387,7 +398,7 @@ public class XAnalyzingSuggester extends Lookup {
       }
       return scratchA.compareTo(scratchB);
     }
   };
 }

 @Override
 public void build(InputIterator iterator) throws IOException {
@@ -454,7 +465,7 @@ public class XAnalyzingSuggester extends Lookup {

       if (hasPayloads) {
         for(int i=0;i<surfaceForm.length;i++) {
-          if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
+          if (surfaceForm.bytes[i] == payloadSep) {
             throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
           }
         }
@@ -558,7 +569,7 @@ public class XAnalyzingSuggester extends Lookup {
           int payloadLength = scratch.length - payloadOffset;
           BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
           System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
-          br.bytes[surface.length] = PAYLOAD_SEP;
+          br.bytes[surface.length] = (byte) payloadSep;
           System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
           br.length = br.bytes.length;
           builder.add(scratchInts, outputs.newPair(cost, br));
@@ -566,8 +577,10 @@ public class XAnalyzingSuggester extends Lookup {
       }
       fst = builder.finish();

-      //Util.dotToFile(fst, "/tmp/suggest.dot");
+      //PrintWriter pw = new PrintWriter("/tmp/out.dot");
+      //Util.toDot(fst, pw, true, true);
+      //pw.close();

       success = true;
     } finally {
       if (success) {
@@ -616,7 +629,7 @@ public class XAnalyzingSuggester extends Lookup {
     if (hasPayloads) {
       int sepIndex = -1;
       for(int i=0;i<output2.length;i++) {
-        if (output2.bytes[output2.offset+i] == PAYLOAD_SEP) {
+        if (output2.bytes[output2.offset+i] == payloadSep) {
           sepIndex = i;
           break;
         }
@@ -649,7 +662,7 @@ public class XAnalyzingSuggester extends Lookup {
           return false;
         }
       }
-      return output2.bytes[output2.offset + key.length] == PAYLOAD_SEP;
+      return output2.bytes[output2.offset + key.length] == payloadSep;
     } else {
       return key.bytesEquals(output2);
     }
@@ -667,6 +680,14 @@ public class XAnalyzingSuggester extends Lookup {
     }

     //System.out.println("lookup key=" + key + " num=" + num);
+    for (int i = 0; i < key.length(); i++) {
+      if (key.charAt(i) == 0x1E) {
+        throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
+      }
+      if (key.charAt(i) == 0x1F) {
+        throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
+      }
+    }
     final BytesRef utf8Key = new BytesRef(key);
     try {
@@ -688,13 +709,13 @@ public class XAnalyzingSuggester extends Lookup {

       final List<LookupResult> results = new ArrayList<LookupResult>();

-      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
+      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);

       if (exactFirst) {

         int count = 0;
         for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
-          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
+          if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
             // This node has END_BYTE arc leaving, meaning it's an
             // "exact" match:
             count++;
@@ -712,7 +733,7 @@ public class XAnalyzingSuggester extends Lookup {
         // pruned our exact match from one of these nodes
         // ...:
         for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
-          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
+          if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
             // This node has END_BYTE arc leaving, meaning it's an
             // "exact" match:
             searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
@@ -820,13 +841,12 @@ public class XAnalyzingSuggester extends Lookup {
       throws IOException {
     return prefixPaths;
   }

-  final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
-    // Analyze surface form:
+  public final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
+    // Analyze surface form:
     TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
+    return toFiniteStrings(ts2a, ts);
+  }
+
+  public final Set<IntsRef> toFiniteStrings(final TokenStreamToAutomaton ts2a, TokenStream ts) throws IOException {
+    // Analyze surface form:
@@ -836,7 +856,7 @@ public class XAnalyzingSuggester extends Lookup {
     Automaton automaton = ts2a.toAutomaton(ts);
     ts.close();

-    replaceSep(automaton, preserveSep);
+    replaceSep(automaton, preserveSep, sepLabel);

     assert SpecialOperations.isFinite(automaton);

@@ -862,7 +882,7 @@ public class XAnalyzingSuggester extends Lookup {
     // This way we could eg differentiate "net" from "net ",
     // which we can't today...

-    replaceSep(automaton, preserveSep);
+    replaceSep(automaton, preserveSep, sepLabel);

     // TODO: we can optimize this somewhat by determinizing
     // while we convert
@@ -903,7 +923,6 @@ public class XAnalyzingSuggester extends Lookup {

   public static class XBuilder {
     private Builder<Pair<Long, BytesRef>> builder;
     BytesRef previousAnalyzed = null;
     private int maxSurfaceFormsPerAnalyzedForm;
     private IntsRef scratchInts = new IntsRef();
     private final PairOutputs<Long, BytesRef> outputs;
@@ -912,8 +931,10 @@ public class XAnalyzingSuggester extends Lookup {
     private final SurfaceFormAndPayload[] surfaceFormsAndPayload;
     private int count;
     private ObjectIntOpenHashMap<BytesRef> seenSurfaceForms = HppcMaps.Object.Integer.ensureNoNullKeys(256, 0.75f);
+    private int payloadSep;

-    public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads) {
+    public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
+      this.payloadSep = payloadSep;
       this.outputs = new PairOutputs<Long, BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
       this.builder = new Builder<Pair<Long, BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
       this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
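XBuilder now takes the payload separator as a constructor argument. A hedged sketch of driving it, mirroring how the completion postings consumer in this patch uses it; the surface form, payload and weight values are made up:

    XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
            256, /* hasPayloads */ true, XAnalyzingSuggester.PAYLOAD_SEP);
    builder.startTerm(new BytesRef("generator"));              // analyzed form
    builder.addSurface(new BytesRef("Generator - Foo Fighters"),
            new BytesRef("id:10"), /* weight */ 9);            // surface form + payload
    builder.finishTerm(/* defaultWeight */ 1);
    FST<Pair<Long, BytesRef>> fst = builder.build();           // may be null if nothing was added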
@@ -983,7 +1004,7 @@ public class XAnalyzingSuggester extends Lookup {
         int len = surface.length + 1 + payload.length;
         final BytesRef br = new BytesRef(len);
         System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
-        br.bytes[surface.length] = PAYLOAD_SEP;
+        br.bytes[surface.length] = (byte) payloadSep;
         System.arraycopy(payload.bytes, payload.offset, br.bytes, surface.length + 1, payload.length);
         br.length = len;
         payloadRef = br;
@@ -19,6 +19,7 @@
 package org.apache.lucene.search.suggest.analyzing;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.automaton.*;
@@ -48,6 +49,9 @@ import java.util.Set;
  * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
  * edited. We allow up to 1 {@link
  * #DEFAULT_MAX_EDITS} edit.
+ * If the {@link #unicodeAware} parameter in the constructor is set to true, maxEdits,
+ * minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code
+ * points (actual letters) instead of bytes.
  *
  * <p>
  * NOTE: This suggester does not boost suggestions that
@@ -60,12 +64,22 @@ import java.util.Set;
  * like synonyms to keep the complexity of the prefix intersection low for good
  * lookup performance. At index time, complex analyzers can safely be used.
  * </p>
  *
  * @lucene.experimental
  */
 public final class XFuzzySuggester extends XAnalyzingSuggester {
   private final int maxEdits;
   private final boolean transpositions;
   private final int nonFuzzyPrefix;
   private final int minFuzzyLength;
+  private final boolean unicodeAware;
+
+  /**
+   * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
+   * parameters in Unicode code points (actual letters)
+   * instead of bytes.
+   */
+  public static final boolean DEFAULT_UNICODE_AWARE = false;

   /**
    * The default minimum length of the key passed to {@link
@@ -108,7 +122,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
    */
   public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
     this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
-         DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, null, false, 0);
+         DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);

   }

@@ -133,11 +147,15 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
    *        Levenshtein algorithm.
    * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
    * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
+   * @param sepLabel separation label
+   * @param payloadSep payload separator byte
+   * @param endByte end byte marker byte
    */
   public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
-                         int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength,
-                         FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput) {
-    super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, fst, hasPayloads, maxAnalyzedPathsForOneInput);
+                         int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware,
+                         FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
+                         int sepLabel, int payloadSep, int endByte) {
+    super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, true, fst, hasPayloads, maxAnalyzedPathsForOneInput, sepLabel, payloadSep, endByte);
     if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
       throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
     }
@@ -152,6 +170,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
     this.transpositions = transpositions;
     this.nonFuzzyPrefix = nonFuzzyPrefix;
     this.minFuzzyLength = minFuzzyLength;
+    this.unicodeAware = unicodeAware;
   }

   @Override
@@ -170,7 +189,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
     // "compete") ... in which case I think the wFST needs
     // to be log weights or something ...

-    Automaton levA = toLevenshteinAutomata(lookupAutomaton);
+    Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
     /*
       Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
       w.write(levA.toDot());
@@ -180,6 +199,24 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
     return FSTUtil.intersectPrefixPaths(levA, fst);
   }

+  @Override
+  protected Automaton convertAutomaton(Automaton a) {
+    if (unicodeAware) {
+      Automaton utf8automaton = new UTF32ToUTF8().convert(a);
+      BasicOperations.determinize(utf8automaton);
+      return utf8automaton;
+    } else {
+      return a;
+    }
+  }
+
+  @Override
+  public TokenStreamToAutomaton getTokenStreamToAutomaton() {
+    final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton();
+    tsta.setUnicodeArcs(unicodeAware);
+    return tsta;
+  }
+
   Automaton toLevenshteinAutomata(Automaton automaton) {
     final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
     Automaton subs[] = new Automaton[ref.size()];
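A hedged summary of the lookup pipeline this enables when unicodeAware is true: the Levenshtein automaton is built over code points (so one typo in "ö" costs one edit), then mapped down into UTF-8 byte space, because the suggest FST itself stores bytes. Reconstructed from the code above, not a verbatim excerpt:

    Automaton levA = toLevenshteinAutomata(lookupAutomaton);    // code-point space
    Automaton utf8LevA = new UTF32ToUTF8().convert(levA);       // down to byte space
    BasicOperations.determinize(utf8LevA);                      // intersection needs a DFA
    List<FSTUtil.Path<Pair<Long, BytesRef>>> paths =
            FSTUtil.intersectPrefixPaths(utf8LevA, fst);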
@@ -197,7 +234,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
       // to allow the trailing dedup bytes to be
       // edited... but then 0 byte is "in general" allowed
       // on input (but not in UTF8).
-      LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
+      LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
       Automaton levAutomaton = lev.toAutomaton(maxEdits);
       Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
       combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
@@ -55,7 +55,8 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
     private static final int MAX_GRAPH_EXPANSIONS = -1;

     public static final String CODEC_NAME = "analyzing";
-    public static final int CODEC_VERSION = 1;
+    public static final int CODEC_VERSION_START = 1;
+    public static final int CODEC_VERSION_LATEST = 2;

     private boolean preserveSep;
     private boolean preservePositionIncrements;
@@ -73,8 +74,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
         int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
         // needs to fixed in the suggester first before it can be supported
         //options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
-        prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, null, false, 1);
-        prototype.setPreservePositionIncrements(preservePositionIncrements);
+        prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
     }

     @Override
@@ -84,7 +84,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider

     @Override
     public FieldsConsumer consumer(final IndexOutput output) throws IOException {
-        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
+        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
         return new FieldsConsumer() {
             private Map<FieldInfo, Long> fieldOffsets = new HashMap<FieldInfo, Long>();

@@ -111,7 +111,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
             public TermsConsumer addField(final FieldInfo field) throws IOException {

                 return new TermsConsumer() {
-                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads);
+                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                     final CompletionPostingsConsumer postingsConsumer = new CompletionPostingsConsumer(AnalyzingCompletionLookupProvider.this, builder);

                     @Override
@@ -156,6 +156,9 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
                         options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                         options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                         output.writeVInt(options);
+                        output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
+                        output.writeVInt(XAnalyzingSuggester.END_BYTE);
+                        output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                     }
                 }
             };
@@ -200,7 +203,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider

     @Override
     public LookupFactory load(IndexInput input) throws IOException {
-        CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION, CODEC_VERSION);
+        int version = CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION_START, CODEC_VERSION_LATEST);
         final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<String, AnalyzingSuggestHolder>();
         input.seek(input.length() - 8);
         long metaPointer = input.readLong();
@@ -225,8 +228,23 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
             boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPERATORS) != 0;
             boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0;
             boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0;
-            lookupMap.put(entry.getValue(), new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
-                    hasPayloads, maxAnalyzedPathsForOneInput, fst));
+
+            // the first version did not include these three fields, so fall back to the old defaults (from before
+            // the analyzing suggester was updated in Lucene, so we cannot use the suggester defaults)
+            int sepLabel, payloadSep, endByte;
+            if (version == CODEC_VERSION_START) {
+                sepLabel = 0xFF;
+                payloadSep = '\u001f';
+                endByte = 0x0;
+            } else {
+                sepLabel = input.readVInt();
+                endByte = input.readVInt();
+                payloadSep = input.readVInt();
+            }
+
+            AnalyzingSuggestHolder holder = new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
+                    hasPayloads, maxAnalyzedPathsForOneInput, fst, sepLabel, payloadSep, endByte);
+            lookupMap.put(entry.getValue(), holder);
         }
         return new LookupFactory() {
             @Override
@@ -242,17 +260,16 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
                     suggester = new XFuzzySuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
                             analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
                             suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
-                            suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(),
+                            suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(), suggestionContext.isFuzzyUnicodeAware(),
                             analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
-                            analyzingSuggestHolder.maxAnalyzedPathsForOneInput);
+                            analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte);

                 } else {
                     suggester = new XAnalyzingSuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
                             analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
-                            analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
-                            analyzingSuggestHolder.maxAnalyzedPathsForOneInput);
+                            analyzingSuggestHolder.preservePositionIncrements, analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
+                            analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte);
                 }
-                suggester.setPreservePositionIncrements(analyzingSuggestHolder.preservePositionIncrements);
                 return suggester;
             }

@@ -280,6 +297,11 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider

                 return new CompletionStats(sizeInBytes, completionFields);
             }
+
+            @Override
+            AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper) {
+                return lookupMap.get(mapper.names().indexName());
+            }
         };
     }

@@ -291,9 +313,16 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
         final boolean hasPayloads;
         final int maxAnalyzedPathsForOneInput;
         final FST<Pair<Long, BytesRef>> fst;
+        final int sepLabel;
+        final int payloadSep;
+        final int endByte;

         public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                                       boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst) {
+            this(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads, maxAnalyzedPathsForOneInput, fst, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
+        }
+
+        public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep, int endByte) {
             this.preserveSep = preserveSep;
             this.preservePositionIncrements = preservePositionIncrements;
             this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
@@ -301,8 +330,10 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
             this.hasPayloads = hasPayloads;
             this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
             this.fst = fst;
+            this.sepLabel = sepLabel;
+            this.payloadSep = payloadSep;
+            this.endByte = endByte;
         }

     }

     @Override
@@ -362,5 +362,6 @@ public class Completion090PostingsFormat extends PostingsFormat {
     public static abstract class LookupFactory {
         public abstract Lookup getLookup(FieldMapper<?> mapper, CompletionSuggestionContext suggestionContext);
         public abstract CompletionStats stats(String ... fields);
+        abstract AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper);
     }
 }
@@ -68,6 +68,8 @@ public class CompletionSuggestParser implements SuggestContextParser {
                         suggestion.setFuzzyMinLength(parser.intValue());
                     } else if ("prefix_length".equals(fuzzyConfigName) || "prefixLength".equals(fuzzyConfigName)) {
                         suggestion.setFuzzyPrefixLength(parser.intValue());
+                    } else if ("unicode_aware".equals(fuzzyConfigName) || "unicodeAware".equals(fuzzyConfigName)) {
+                        suggestion.setFuzzyUnicodeAware(parser.booleanValue());
                     }
                 }
             }
@@ -34,6 +34,7 @@ public class CompletionSuggestionContext extends SuggestionSearchContext.Suggest
     private int fuzzyMinLength = XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
     private int fuzzyPrefixLength = XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
     private boolean fuzzy = false;
+    private boolean fuzzyUnicodeAware = XFuzzySuggester.DEFAULT_UNICODE_AWARE;

     public CompletionSuggestionContext(Suggester suggester) {
         super(suggester);
@@ -86,4 +87,12 @@ public class CompletionSuggestionContext extends SuggestionSearchContext.Suggest
     public boolean isFuzzy() {
         return fuzzy;
     }
+
+    public void setFuzzyUnicodeAware(boolean fuzzyUnicodeAware) {
+        this.fuzzyUnicodeAware = fuzzyUnicodeAware;
+    }
+
+    public boolean isFuzzyUnicodeAware() {
+        return fuzzyUnicodeAware;
+    }
 }
@@ -38,6 +38,7 @@ public class CompletionSuggestionFuzzyBuilder extends SuggestBuilder.SuggestionB
     private boolean fuzzyTranspositions = XFuzzySuggester.DEFAULT_TRANSPOSITIONS;
     private int fuzzyMinLength = XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
     private int fuzzyPrefixLength = XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
+    private boolean unicodeAware = XFuzzySuggester.DEFAULT_UNICODE_AWARE;

     public int getFuzzyEditDistance() {
         return fuzzyEditDistance;
@@ -75,6 +76,15 @@ public class CompletionSuggestionFuzzyBuilder extends SuggestBuilder.SuggestionB
         return this;
     }

+    public boolean isUnicodeAware() {
+        return unicodeAware;
+    }
+
+    public CompletionSuggestionFuzzyBuilder setUnicodeAware(boolean unicodeAware) {
+        this.unicodeAware = unicodeAware;
+        return this;
+    }
+
     @Override
     protected XContentBuilder innerToXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
         builder.startObject("fuzzy");
@@ -91,6 +101,9 @@ public class CompletionSuggestionFuzzyBuilder extends SuggestBuilder.SuggestionB
         if (fuzzyPrefixLength != XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX) {
             builder.field("prefix_length", fuzzyPrefixLength);
         }
+        if (unicodeAware != XFuzzySuggester.DEFAULT_UNICODE_AWARE) {
+            builder.field("unicode_aware", unicodeAware);
+        }

         builder.endObject();
         return builder;
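For reference, a hedged sketch of what this serialization produces; the suggestion name is hypothetical, and only values differing from the XFuzzySuggester defaults are written out:

    CompletionSuggestionFuzzyBuilder b = new CompletionSuggestionFuzzyBuilder("my-suggestion");
    b.setUnicodeAware(true);
    // innerToXContent then emits, inside the suggestion body, something like:
    //   "fuzzy" : { "unicode_aware" : true }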
@@ -566,6 +566,36 @@ public class CompletionSuggestSearchTests extends ElasticsearchIntegrationTest {
         assertSuggestions(suggestResponse, false, "foo", "Nirvana");
     }

+    @Test
+    public void testThatFuzzySuggesterIsUnicodeAware() throws Exception {
+        createIndexAndMapping("simple", "simple", true, true, true);
+
+        client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
+                .startObject().startObject(FIELD)
+                .startArray("input").value("ööööö").endArray()
+                .endObject().endObject()
+        ).get();
+
+        refresh();
+
+        // suggestion with a character, which needs unicode awareness
+        CompletionSuggestionFuzzyBuilder completionSuggestionBuilder =
+                new CompletionSuggestionFuzzyBuilder("foo").field(FIELD).text("öööи").size(10).setUnicodeAware(true);
+
+        SuggestResponse suggestResponse = client().prepareSuggest(INDEX).addSuggestion(completionSuggestionBuilder).execute().actionGet();
+        assertSuggestions(suggestResponse, false, "foo", "ööööö");
+
+        // removing unicode awareness leads to no result
+        completionSuggestionBuilder.setUnicodeAware(false);
+        suggestResponse = client().prepareSuggest(INDEX).addSuggestion(completionSuggestionBuilder).execute().actionGet();
+        assertSuggestions(suggestResponse, false, "foo");
+
+        // increasing edit distance instead of unicode awareness works again, as this is only a single character
+        completionSuggestionBuilder.setFuzzyEditDistance(2);
+        suggestResponse = client().prepareSuggest(INDEX).addSuggestion(completionSuggestionBuilder).execute().actionGet();
+        assertSuggestions(suggestResponse, false, "foo", "ööööö");
+    }
+
     @Test
     public void testThatStatsAreWorking() throws Exception {
         String otherField = "testOtherField";
@@ -650,8 +680,11 @@ public class CompletionSuggestSearchTests extends ElasticsearchIntegrationTest {

         refresh();

         assertSuggestions("f", "Feed the trolls", "Feed trolls");
         assertSuggestions("fe", "Feed the trolls", "Feed trolls");
         assertSuggestions("fee", "Feed the trolls", "Feed trolls");
         assertSuggestions("feed", "Feed the trolls", "Feed trolls");
         assertSuggestions("feed t", "Feed the trolls", "Feed trolls");
         assertSuggestions("feed th", "Feed the trolls");
         assertSuggestions("feed the", "Feed the trolls");
         // stop word complete, gets ignored on query time, makes it "feed" only
         assertSuggestions("feed the ", "Feed the trolls", "Feed trolls");
@@ -0,0 +1,330 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.search.suggest.completion;

import com.carrotsearch.hppc.ObjectLongOpenHashMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.*;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
import org.apache.lucene.search.suggest.analyzing.XFuzzySuggester;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.CompletionLookupProvider;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.LookupFactory;
import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder;

import java.io.IOException;
import java.util.*;

/**
 * This is an older implementation of the AnalyzingCompletionLookupProvider class.
 * We use this to test for backwards compatibility in our tests, namely
 * CompletionPostingsFormatTest.
 * This ensures upgrades between versions work smoothly.
 */
public class AnalyzingCompletionLookupProviderV1 extends CompletionLookupProvider {

    // for serialization
    public static final int SERIALIZE_PRESERVE_SEPERATORS = 1;
    public static final int SERIALIZE_HAS_PAYLOADS = 2;
    public static final int SERIALIZE_PRESERVE_POSITION_INCREMENTS = 4;

    private static final int MAX_SURFACE_FORMS_PER_ANALYZED_FORM = 256;
    private static final int MAX_GRAPH_EXPANSIONS = -1;

    public static final String CODEC_NAME = "analyzing";
    public static final int CODEC_VERSION = 1;

    private boolean preserveSep;
    private boolean preservePositionIncrements;
    private int maxSurfaceFormsPerAnalyzedForm;
    private int maxGraphExpansions;
    private boolean hasPayloads;
    private final XAnalyzingSuggester prototype;

    // important, these are the settings from the old xanalyzingsuggester
    public static final int SEP_LABEL = 0xFF;
    public static final int END_BYTE = 0x0;
    public static final int PAYLOAD_SEP = '\u001f';

    public AnalyzingCompletionLookupProviderV1(boolean preserveSep, boolean exactFirst, boolean preservePositionIncrements, boolean hasPayloads) {
        this.preserveSep = preserveSep;
        this.preservePositionIncrements = preservePositionIncrements;
        this.hasPayloads = hasPayloads;
        this.maxSurfaceFormsPerAnalyzedForm = MAX_SURFACE_FORMS_PER_ANALYZED_FORM;
        this.maxGraphExpansions = MAX_GRAPH_EXPANSIONS;
        int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
        // needs to fixed in the suggester first before it can be supported
        //options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
        prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements,
                null, false, 1, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
    }

    @Override
    public String getName() {
        return "analyzing";
    }

    @Override
    public FieldsConsumer consumer(final IndexOutput output) throws IOException {
        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
        return new FieldsConsumer() {
            private Map<FieldInfo, Long> fieldOffsets = new HashMap<FieldInfo, Long>();

            @Override
            public void close() throws IOException {
                try { /*
                       * write the offsets per field such that we know where
                       * we need to load the FSTs from
                       */
                    long pointer = output.getFilePointer();
                    output.writeVInt(fieldOffsets.size());
                    for (Map.Entry<FieldInfo, Long> entry : fieldOffsets.entrySet()) {
                        output.writeString(entry.getKey().name);
                        output.writeVLong(entry.getValue());
                    }
                    output.writeLong(pointer);
                    output.flush();
                } finally {
                    IOUtils.close(output);
                }
            }

            @Override
            public TermsConsumer addField(final FieldInfo field) throws IOException {

                return new TermsConsumer() {
                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads, PAYLOAD_SEP);
                    final CompletionPostingsConsumer postingsConsumer = new CompletionPostingsConsumer(AnalyzingCompletionLookupProviderV1.this, builder);

                    @Override
                    public PostingsConsumer startTerm(BytesRef text) throws IOException {
                        builder.startTerm(text);
                        return postingsConsumer;
                    }

                    @Override
                    public Comparator<BytesRef> getComparator() throws IOException {
                        return BytesRef.getUTF8SortedAsUnicodeComparator();
                    }

                    @Override
                    public void finishTerm(BytesRef text, TermStats stats) throws IOException {
                        builder.finishTerm(stats.docFreq); // use doc freq as a fallback
                    }

                    @Override
                    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
                        /*
                         * Here we are done processing the field and we can
                         * build the FST and write it to disk.
                         */
                        FST<Pair<Long, BytesRef>> build = builder.build();
                        assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
                        /*
                         * it's possible that the FST is null if we have 2 segments that get merged
                         * and all docs that have a value in this field are deleted. This will cause
                         * a consumer to be created but it doesn't consume any values causing the FSTBuilder
                         * to return null.
                         */
                        if (build != null) {
                            fieldOffsets.put(field, output.getFilePointer());
                            build.save(output);
                            /* write some more meta-info */
                            output.writeVInt(postingsConsumer.getMaxAnalyzedPathsForOneInput());
                            output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                            output.writeInt(maxGraphExpansions); // can be negative
                            int options = 0;
                            options |= preserveSep ? SERIALIZE_PRESERVE_SEPERATORS : 0;
                            options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                            options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                            output.writeVInt(options);
                        }
                    }
                };
            }
        };
    }

    private static final class CompletionPostingsConsumer extends PostingsConsumer {
        private final SuggestPayload spare = new SuggestPayload();
        private AnalyzingCompletionLookupProviderV1 analyzingSuggestLookupProvider;
        private XAnalyzingSuggester.XBuilder builder;
        private int maxAnalyzedPathsForOneInput = 0;

        public CompletionPostingsConsumer(AnalyzingCompletionLookupProviderV1 analyzingSuggestLookupProvider, XAnalyzingSuggester.XBuilder builder) {
            this.analyzingSuggestLookupProvider = analyzingSuggestLookupProvider;
            this.builder = builder;
        }

        @Override
        public void startDoc(int docID, int freq) throws IOException {
        }

        @Override
        public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
            analyzingSuggestLookupProvider.parsePayload(payload, spare);
            builder.addSurface(spare.surfaceForm, spare.payload, spare.weight);
            // multi fields have the same surface form so we sum up here
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
        }

        @Override
        public void finishDoc() throws IOException {
        }

        public int getMaxAnalyzedPathsForOneInput() {
            return maxAnalyzedPathsForOneInput;
        }
    }

    @Override
    public LookupFactory load(IndexInput input) throws IOException {
        CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION, CODEC_VERSION);
        final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<String, AnalyzingSuggestHolder>();
        input.seek(input.length() - 8);
        long metaPointer = input.readLong();
        input.seek(metaPointer);
        int numFields = input.readVInt();

        Map<Long, String> meta = new TreeMap<Long, String>();
        for (int i = 0; i < numFields; i++) {
            String name = input.readString();
            long offset = input.readVLong();
            meta.put(offset, name);
        }

        for (Map.Entry<Long, String> entry : meta.entrySet()) {
            input.seek(entry.getKey());
            FST<Pair<Long, BytesRef>> fst = new FST<Pair<Long, BytesRef>>(input, new PairOutputs<Long, BytesRef>(
                    PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
            int maxAnalyzedPathsForOneInput = input.readVInt();
            int maxSurfaceFormsPerAnalyzedForm = input.readVInt();
            int maxGraphExpansions = input.readInt();
            int options = input.readVInt();
            boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPERATORS) != 0;
            boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0;
            boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0;
            lookupMap.put(entry.getValue(), new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
                    hasPayloads, maxAnalyzedPathsForOneInput, fst));
        }
        return new LookupFactory() {
            @Override
            public Lookup getLookup(FieldMapper<?> mapper, CompletionSuggestionContext suggestionContext) {
                AnalyzingSuggestHolder analyzingSuggestHolder = lookupMap.get(mapper.names().indexName());
                if (analyzingSuggestHolder == null) {
                    return null;
                }
                int flags = analyzingSuggestHolder.preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;

                XAnalyzingSuggester suggester;
                if (suggestionContext.isFuzzy()) {
                    suggester = new XFuzzySuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
                            analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
                            suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
                            suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(), false,
                            analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
                            analyzingSuggestHolder.maxAnalyzedPathsForOneInput, SEP_LABEL, PAYLOAD_SEP, END_BYTE);

                } else {
                    suggester = new XAnalyzingSuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
                            analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
                            analyzingSuggestHolder.preservePositionIncrements,
                            analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
                            analyzingSuggestHolder.maxAnalyzedPathsForOneInput, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
                }
                return suggester;
            }

            @Override
            public CompletionStats stats(String... fields) {
                long sizeInBytes = 0;
                ObjectLongOpenHashMap<String> completionFields = null;
                if (fields != null && fields.length > 0) {
                    completionFields = new ObjectLongOpenHashMap<String>(fields.length);
                }

                for (Map.Entry<String, AnalyzingSuggestHolder> entry : lookupMap.entrySet()) {
                    sizeInBytes += entry.getValue().fst.sizeInBytes();
                    if (fields == null || fields.length == 0) {
                        continue;
                    }
                    for (String field : fields) {
                        // support for getting fields by regex as in fielddata
                        if (Regex.simpleMatch(field, entry.getKey())) {
                            long fstSize = entry.getValue().fst.sizeInBytes();
                            completionFields.addTo(field, fstSize);
                        }
                    }
                }

                return new CompletionStats(sizeInBytes, completionFields);
            }

            @Override
            AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper) {
                return lookupMap.get(mapper.names().indexName());
            }
        };
    }

    /*
    // might be readded when we change the current impl, right now not needed
    static class AnalyzingSuggestHolder {
        final boolean preserveSep;
        final boolean preservePositionIncrements;
        final int maxSurfaceFormsPerAnalyzedForm;
        final int maxGraphExpansions;
        final boolean hasPayloads;
        final int maxAnalyzedPathsForOneInput;
        final FST<Pair<Long, BytesRef>> fst;

        public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                                      boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst) {
            this.preserveSep = preserveSep;
            this.preservePositionIncrements = preservePositionIncrements;
            this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
            this.maxGraphExpansions = maxGraphExpansions;
            this.hasPayloads = hasPayloads;
            this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
            this.fst = fst;
        }

    }
    */

    @Override
    public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
        return prototype.toFiniteStrings(prototype.getTokenStreamToAutomaton(), stream);
    }
}
@@ -17,8 +17,9 @@
  * under the License.
  */

-package org.elasticsearch.search.suggest;
+package org.elasticsearch.search.suggest.completion;

+import com.google.common.collect.Lists;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.codecs.*;
 import org.apache.lucene.document.Document;
@@ -42,10 +43,8 @@ import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
 import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvider;
 import org.elasticsearch.index.mapper.FieldMapper.Names;
 import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
-import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider;
-import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat;
+import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.LookupFactory;
-import org.elasticsearch.search.suggest.completion.CompletionSuggestionContext;
 import org.elasticsearch.test.ElasticsearchTestCase;
 import org.junit.Test;

@@ -56,28 +55,33 @@ import java.util.HashMap;
 import java.util.List;

 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.is;

 public class CompletionPostingsFormatTest extends ElasticsearchTestCase {

     @Test
     public void testCompletionPostingsFormat() throws IOException {
-        AnalyzingCompletionLookupProvider provider = new AnalyzingCompletionLookupProvider(true, false, true, true);
+        AnalyzingCompletionLookupProviderV1 providerV1 = new AnalyzingCompletionLookupProviderV1(true, false, true, true);
+        AnalyzingCompletionLookupProvider currentProvider = new AnalyzingCompletionLookupProvider(true, false, true, true);
+        List<Completion090PostingsFormat.CompletionLookupProvider> providers = Lists.newArrayList(providerV1, currentProvider);
+
+        Completion090PostingsFormat.CompletionLookupProvider randomProvider = providers.get(getRandom().nextInt(providers.size()));
         RAMDirectory dir = new RAMDirectory();
         IndexOutput output = dir.createOutput("foo.txt", IOContext.DEFAULT);
-        FieldsConsumer consumer = provider.consumer(output);
+        FieldsConsumer consumer = randomProvider.consumer(output);
         FieldInfo fieldInfo = new FieldInfo("foo", true, 1, false, true, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                 DocValuesType.SORTED, DocValuesType.BINARY, new HashMap<String, String>());
         TermsConsumer addField = consumer.addField(fieldInfo);

         PostingsConsumer postingsConsumer = addField.startTerm(new BytesRef("foofightersgenerator"));
         postingsConsumer.startDoc(0, 1);
-        postingsConsumer.addPosition(256 - 2, provider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
+        postingsConsumer.addPosition(256 - 2, randomProvider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
                 1);
         postingsConsumer.finishDoc();
         addField.finishTerm(new BytesRef("foofightersgenerator"), new TermStats(1, 1));
         addField.startTerm(new BytesRef("generator"));
         postingsConsumer.startDoc(0, 1);
-        postingsConsumer.addPosition(256 - 1, provider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
+        postingsConsumer.addPosition(256 - 1, randomProvider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
                 1);
         postingsConsumer.finishDoc();
         addField.finishTerm(new BytesRef("generator"), new TermStats(1, 1));
@@ -86,7 +90,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
         output.close();

         IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
-        LookupFactory load = provider.load(input);
+        LookupFactory load = currentProvider.load(input);
         PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
         NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
         Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null));
@@ -96,6 +100,46 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
         dir.close();
     }

+    @Test
+    public void testProviderBackwardCompatibilityForVersion1() throws IOException {
+        AnalyzingCompletionLookupProviderV1 providerV1 = new AnalyzingCompletionLookupProviderV1(true, false, true, true);
+        AnalyzingCompletionLookupProvider currentProvider = new AnalyzingCompletionLookupProvider(true, false, true, true);
+
+        RAMDirectory dir = new RAMDirectory();
+        IndexOutput output = dir.createOutput("foo.txt", IOContext.DEFAULT);
+        FieldsConsumer consumer = providerV1.consumer(output);
+        FieldInfo fieldInfo = new FieldInfo("foo", true, 1, false, true, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
+                DocValuesType.SORTED, DocValuesType.BINARY, new HashMap<String, String>());
+        TermsConsumer addField = consumer.addField(fieldInfo);
+
+        PostingsConsumer postingsConsumer = addField.startTerm(new BytesRef("foofightersgenerator"));
+        postingsConsumer.startDoc(0, 1);
+        postingsConsumer.addPosition(256 - 2, providerV1.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
+                1);
+        postingsConsumer.finishDoc();
+        addField.finishTerm(new BytesRef("foofightersgenerator"), new TermStats(1, 1));
+        addField.startTerm(new BytesRef("generator"));
+        postingsConsumer.startDoc(0, 1);
+        postingsConsumer.addPosition(256 - 1, providerV1.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
+                1);
+        postingsConsumer.finishDoc();
+        addField.finishTerm(new BytesRef("generator"), new TermStats(1, 1));
+        addField.finish(1, 1, 1);
+        consumer.close();
+        output.close();
+
+        IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
+        LookupFactory load = currentProvider.load(input);
+
+        PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
+        NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
+        AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder analyzingSuggestHolder = load.getAnalyzingSuggestHolder(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE));
+        assertThat(analyzingSuggestHolder.sepLabel, is(AnalyzingCompletionLookupProviderV1.SEP_LABEL));
+        assertThat(analyzingSuggestHolder.payloadSep, is(AnalyzingCompletionLookupProviderV1.PAYLOAD_SEP));
+        assertThat(analyzingSuggestHolder.endByte, is(AnalyzingCompletionLookupProviderV1.END_BYTE));
+        dir.close();
+    }
+
     @Test
     public void testDuellCompletions() throws IOException, NoSuchFieldException, SecurityException, IllegalArgumentException,
             IllegalAccessException {
@@ -105,8 +149,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
         final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;

         XAnalyzingSuggester reference = new XAnalyzingSuggester(new StandardAnalyzer(TEST_VERSION_CURRENT), new StandardAnalyzer(
-                TEST_VERSION_CURRENT), options, 256, -1, null, false, 1);
-        reference.setPreservePositionIncrements(preservePositionIncrements);
+                TEST_VERSION_CURRENT), options, 256, -1, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
         LineFileDocs docs = new LineFileDocs(getRandom());
         int num = atLeast(150);
         final String[] titles = new String[num];