Updated Analyzing/Fuzzysuggester from lucene trunk

* Minor alignments (like setter to ctor)
* FuzzySuggester has a unicode aware flag, which is now exposed in the fuzzy completion request parameters
* Made XAnalyzingSuggester flags (PAYLOAD_SEP, END_BYTE, SEP_LABEL) to be written into the postings format, so we can retain backwards compatibility
* The above change also implies, that these flags can be set per instantiated XAnalyzingSuggester
* CompletionPostingsFormatTest now uses a randomProvider for writing data to check for bwc
This commit is contained in:
Alexander Reelsen 2013-11-25 18:22:34 +01:00
parent 9f5d01ca4c
commit bf74f49fdd
11 changed files with 605 additions and 80 deletions

View File

@ -218,6 +218,11 @@ The following parameters are supported:
Minimum length of the input, which is not
checked for fuzzy alternatives, defaults to `1`
`unicode_aware`::
Sets all measurements (like edit distance,
transpositions and lengths) in unicode code points
(actual letters) instead of bytes.
NOTE: If you want to stick with the default values, but
still use fuzzy, you can either use `fuzzy: {}`
or `fuzzy: true`.

View File

@ -26,6 +26,8 @@ import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Sort;
import org.apache.lucene.store.*;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.*;
import org.apache.lucene.util.automaton.*;
import org.apache.lucene.util.fst.*;
@ -34,10 +36,7 @@ import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.Util.MinResult;
import org.elasticsearch.common.collect.HppcMaps;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.*;
import java.util.*;
/**
@ -53,8 +52,9 @@ import java.util.*;
* then the partial text "ghost chr..." could see the
* suggestion "The Ghost of Christmas Past". Note that
* position increments MUST NOT be preserved for this example
* to work, so you should call
* {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}.
* to work, so you should call the constructor with
* <code>preservePositionIncrements</code> parameter set to
* false
*
* <p>
* If SynonymFilter is used to map wifi and wireless network to
@ -124,24 +124,24 @@ public class XAnalyzingSuggester extends Lookup {
private final boolean preserveSep;
/** Include this flag in the options parameter to {@link
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)} to always
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)} to always
* return the exact match first, regardless of score. This
* has no performance impact but could result in
* low-quality suggestions. */
public static final int EXACT_FIRST = 1;
/** Include this flag in the options parameter to {@link
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)} to preserve
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)} to preserve
* token separators when matching. */
public static final int PRESERVE_SEP = 2;
/** Represents the separation between tokens, if
* PRESERVE_SEP was specified */
private static final int SEP_LABEL = 0xFF;
public static final int SEP_LABEL = '\u001F';
/** Marks end of the analyzed input and start of dedup
* byte. */
private static final int END_BYTE = 0x0;
public static final int END_BYTE = 0x0;
/** Maximum number of dup surface forms (different surface
* forms for the same analyzed form). */
@ -160,27 +160,31 @@ public class XAnalyzingSuggester extends Lookup {
private boolean hasPayloads;
private static final int PAYLOAD_SEP = '\u001f';
private final int sepLabel;
private final int payloadSep;
private final int endByte;
public static final int PAYLOAD_SEP = '\u001f';
/** Whether position holes should appear in the automaton. */
private boolean preservePositionIncrements;
/**
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
* PRESERVE_SEP, 256, -1)}
*/
public XAnalyzingSuggester(Analyzer analyzer) {
this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, null, false, 0);
this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
}
/**
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,FST,boolean,int)
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)
* AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |
* PRESERVE_SEP, 256, -1)}
*/
public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, null, false, 0);
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
}
/**
@ -199,8 +203,9 @@ public class XAnalyzingSuggester extends Lookup {
* to expand from the analyzed form. Set this to -1 for
* no limit.
*/
public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions
, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput) {
public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
int sepLabel, int payloadSep, int endByte) {
// SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
this.indexAnalyzer = indexAnalyzer;
this.queryAnalyzer = queryAnalyzer;
@ -226,13 +231,10 @@ public class XAnalyzingSuggester extends Lookup {
}
this.maxGraphExpansions = maxGraphExpansions;
this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
this.preservePositionIncrements = true;
}
/** Whether to take position holes (position increment > 1) into account when
* building the automaton, <code>true</code> by default. */
public void setPreservePositionIncrements(boolean preservePositionIncrements) {
this.preservePositionIncrements = preservePositionIncrements;
this.sepLabel = sepLabel;
this.payloadSep = payloadSep;
this.endByte = endByte;
}
/** Returns byte size of the underlying FST. */
@ -251,7 +253,7 @@ public class XAnalyzingSuggester extends Lookup {
// Replaces SEP with epsilon or remaps them if
// we were asked to preserve them:
private static void replaceSep(Automaton a, boolean preserveSep) {
private static void replaceSep(Automaton a, boolean preserveSep, int replaceSep) {
State[] states = a.getNumberedStates();
@ -265,7 +267,7 @@ public class XAnalyzingSuggester extends Lookup {
if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
if (preserveSep) {
// Remap to SEP_LABEL:
newTransitions.add(new Transition(SEP_LABEL, t.getDest()));
newTransitions.add(new Transition(replaceSep, t.getDest()));
} else {
copyDestTransitions(state, t.getDest(), newTransitions);
a.setDeterministic(false);
@ -289,21 +291,30 @@ public class XAnalyzingSuggester extends Lookup {
}
}
protected Automaton convertAutomaton(Automaton a) {
return a;
}
/** Just escapes the 0xff byte (which we still use for SEP). */
private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
final BytesRef spare = new BytesRef();
private char sepLabel;
public EscapingTokenStreamToAutomaton(char sepLabel) {
this.sepLabel = sepLabel;
}
@Override
protected BytesRef changeToken(BytesRef in) {
int upto = 0;
for(int i=0;i<in.length;i++) {
byte b = in.bytes[in.offset+i];
if (b == (byte) SEP_LABEL) {
if (b == (byte) sepLabel) {
if (spare.bytes.length == upto) {
spare.grow(upto+2);
}
spare.bytes[upto++] = (byte) SEP_LABEL;
spare.bytes[upto++] = (byte) sepLabel;
spare.bytes[upto++] = b;
} else {
if (spare.bytes.length == upto) {
@ -321,7 +332,7 @@ public class XAnalyzingSuggester extends Lookup {
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
final TokenStreamToAutomaton tsta;
if (preserveSep) {
tsta = new EscapingTokenStreamToAutomaton();
tsta = new EscapingTokenStreamToAutomaton((char) sepLabel);
} else {
// When we're not preserving sep, we don't steal 0xff
// byte, so we don't need to do any escaping:
@ -387,7 +398,7 @@ public class XAnalyzingSuggester extends Lookup {
}
return scratchA.compareTo(scratchB);
}
};
}
@Override
public void build(InputIterator iterator) throws IOException {
@ -454,7 +465,7 @@ public class XAnalyzingSuggester extends Lookup {
if (hasPayloads) {
for(int i=0;i<surfaceForm.length;i++) {
if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
if (surfaceForm.bytes[i] == payloadSep) {
throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
}
}
@ -558,7 +569,7 @@ public class XAnalyzingSuggester extends Lookup {
int payloadLength = scratch.length - payloadOffset;
BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = PAYLOAD_SEP;
br.bytes[surface.length] = (byte) payloadSep;
System.arraycopy(scratch.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
br.length = br.bytes.length;
builder.add(scratchInts, outputs.newPair(cost, br));
@ -566,7 +577,9 @@ public class XAnalyzingSuggester extends Lookup {
}
fst = builder.finish();
//Util.dotToFile(fst, "/tmp/suggest.dot");
//PrintWriter pw = new PrintWriter("/tmp/out.dot");
//Util.toDot(fst, pw, true, true);
//pw.close();
success = true;
} finally {
@ -616,7 +629,7 @@ public class XAnalyzingSuggester extends Lookup {
if (hasPayloads) {
int sepIndex = -1;
for(int i=0;i<output2.length;i++) {
if (output2.bytes[output2.offset+i] == PAYLOAD_SEP) {
if (output2.bytes[output2.offset+i] == payloadSep) {
sepIndex = i;
break;
}
@ -649,7 +662,7 @@ public class XAnalyzingSuggester extends Lookup {
return false;
}
}
return output2.bytes[output2.offset + key.length] == PAYLOAD_SEP;
return output2.bytes[output2.offset + key.length] == payloadSep;
} else {
return key.bytesEquals(output2);
}
@ -667,6 +680,14 @@ public class XAnalyzingSuggester extends Lookup {
}
//System.out.println("lookup key=" + key + " num=" + num);
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == 0x1E) {
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == 0x1F) {
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}
final BytesRef utf8Key = new BytesRef(key);
try {
@ -688,13 +709,13 @@ public class XAnalyzingSuggester extends Lookup {
final List<LookupResult> results = new ArrayList<LookupResult>();
List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
int count = 0;
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
count++;
@ -712,7 +733,7 @@ public class XAnalyzingSuggester extends Lookup {
// pruned our exact match from one of these nodes
// ...:
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
@ -821,12 +842,11 @@ public class XAnalyzingSuggester extends Lookup {
return prefixPaths;
}
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
public final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
// Analyze surface form:
TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
return toFiniteStrings(ts2a, ts);
}
public final Set<IntsRef> toFiniteStrings(final TokenStreamToAutomaton ts2a, TokenStream ts) throws IOException {
// Analyze surface form:
@ -836,7 +856,7 @@ public class XAnalyzingSuggester extends Lookup {
Automaton automaton = ts2a.toAutomaton(ts);
ts.close();
replaceSep(automaton, preserveSep);
replaceSep(automaton, preserveSep, sepLabel);
assert SpecialOperations.isFinite(automaton);
@ -862,7 +882,7 @@ public class XAnalyzingSuggester extends Lookup {
// This way we could eg differentiate "net" from "net ",
// which we can't today...
replaceSep(automaton, preserveSep);
replaceSep(automaton, preserveSep, sepLabel);
// TODO: we can optimize this somewhat by determinizing
// while we convert
@ -903,7 +923,6 @@ public class XAnalyzingSuggester extends Lookup {
public static class XBuilder {
private Builder<Pair<Long, BytesRef>> builder;
BytesRef previousAnalyzed = null;
private int maxSurfaceFormsPerAnalyzedForm;
private IntsRef scratchInts = new IntsRef();
private final PairOutputs<Long, BytesRef> outputs;
@ -912,8 +931,10 @@ public class XAnalyzingSuggester extends Lookup {
private final SurfaceFormAndPayload[] surfaceFormsAndPayload;
private int count;
private ObjectIntOpenHashMap<BytesRef> seenSurfaceForms = HppcMaps.Object.Integer.ensureNoNullKeys(256, 0.75f);
private int payloadSep;
public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads) {
public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
this.payloadSep = payloadSep;
this.outputs = new PairOutputs<Long, BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
this.builder = new Builder<Pair<Long, BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
@ -983,7 +1004,7 @@ public class XAnalyzingSuggester extends Lookup {
int len = surface.length + 1 + payload.length;
final BytesRef br = new BytesRef(len);
System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
br.bytes[surface.length] = PAYLOAD_SEP;
br.bytes[surface.length] = (byte) payloadSep;
System.arraycopy(payload.bytes, payload.offset, br.bytes, surface.length + 1, payload.length);
br.length = len;
payloadRef = br;

View File

@ -19,6 +19,7 @@
package org.apache.lucene.search.suggest.analyzing;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.*;
@ -48,6 +49,9 @@ import java.util.Set;
* #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
* edited. We allow up to 1 ({@link
* #DEFAULT_MAX_EDITS}) edit.
* If {@link #unicodeAware} parameter in the constructor is set to true, maxEdits,
* minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code
* points (actual letters) instead of bytes.
*
* <p>
* NOTE: This suggester does not boost suggestions that
@ -60,12 +64,22 @@ import java.util.Set;
* like synonyms to keep the complexity of the prefix intersection low for good
* lookup performance. At index time, complex analyzers can safely be used.
* </p>
*
* @lucene.experimental
*/
public final class XFuzzySuggester extends XAnalyzingSuggester {
private final int maxEdits;
private final boolean transpositions;
private final int nonFuzzyPrefix;
private final int minFuzzyLength;
private final boolean unicodeAware;
/**
* Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
* parameters in Unicode code points (actual letters)
* instead of bytes.
*/
public static final boolean DEFAULT_UNICODE_AWARE = false;
/**
* The default minimum length of the key passed to {@link
@ -108,7 +122,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
*/
public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, null, false, 0);
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
}
@ -133,11 +147,15 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
* Levenshtein algorithm.
* @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX})
* @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
* @param sepLabel separation label
* @param payloadSep payload separator byte
* @param endByte end byte marker byte
*/
public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength,
FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, fst, hasPayloads, maxAnalyzedPathsForOneInput);
int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware,
FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
int sepLabel, int payloadSep, int endByte) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, true, fst, hasPayloads, maxAnalyzedPathsForOneInput, sepLabel, payloadSep, endByte);
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
}
@ -152,6 +170,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
this.transpositions = transpositions;
this.nonFuzzyPrefix = nonFuzzyPrefix;
this.minFuzzyLength = minFuzzyLength;
this.unicodeAware = unicodeAware;
}
@Override
@ -170,7 +189,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
// "compete") ... in which case I think the wFST needs
// to be log weights or something ...
Automaton levA = toLevenshteinAutomata(lookupAutomaton);
Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
w.write(levA.toDot());
@ -180,6 +199,24 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
return FSTUtil.intersectPrefixPaths(levA, fst);
}
@Override
protected Automaton convertAutomaton(Automaton a) {
if (unicodeAware) {
Automaton utf8automaton = new UTF32ToUTF8().convert(a);
BasicOperations.determinize(utf8automaton);
return utf8automaton;
} else {
return a;
}
}
@Override
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton();
tsta.setUnicodeArcs(unicodeAware);
return tsta;
}
Automaton toLevenshteinAutomata(Automaton automaton) {
final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
Automaton subs[] = new Automaton[ref.size()];
@ -197,7 +234,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already

View File

@ -55,7 +55,8 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
private static final int MAX_GRAPH_EXPANSIONS = -1;
public static final String CODEC_NAME = "analyzing";
public static final int CODEC_VERSION = 1;
public static final int CODEC_VERSION_START = 1;
public static final int CODEC_VERSION_LATEST = 2;
private boolean preserveSep;
private boolean preservePositionIncrements;
@ -73,8 +74,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
// needs to be fixed in the suggester first before it can be supported
//options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, null, false, 1);
prototype.setPreservePositionIncrements(preservePositionIncrements);
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
}
@Override
@ -84,7 +84,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
return new FieldsConsumer() {
private Map<FieldInfo, Long> fieldOffsets = new HashMap<FieldInfo, Long>();
@ -111,7 +111,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
public TermsConsumer addField(final FieldInfo field) throws IOException {
return new TermsConsumer() {
final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads);
final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
final CompletionPostingsConsumer postingsConsumer = new CompletionPostingsConsumer(AnalyzingCompletionLookupProvider.this, builder);
@Override
@ -156,6 +156,9 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
output.writeVInt(options);
output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
output.writeVInt(XAnalyzingSuggester.END_BYTE);
output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
}
}
};
@ -200,7 +203,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
@Override
public LookupFactory load(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION, CODEC_VERSION);
int version = CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION_START, CODEC_VERSION_LATEST);
final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<String, AnalyzingSuggestHolder>();
input.seek(input.length() - 8);
long metaPointer = input.readLong();
@ -225,8 +228,23 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPERATORS) != 0;
boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0;
boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0;
lookupMap.put(entry.getValue(), new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
hasPayloads, maxAnalyzedPathsForOneInput, fst));
// first version did not include these three fields, so fall back to old default (before the analyzingsuggester
// was updated in Lucene, so we cannot use the suggester defaults)
int sepLabel, payloadSep, endByte;
if (version == CODEC_VERSION_START) {
sepLabel = 0xFF;
payloadSep = '\u001f';
endByte = 0x0;
} else {
sepLabel = input.readVInt();
endByte = input.readVInt();
payloadSep = input.readVInt();
}
AnalyzingSuggestHolder holder = new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
hasPayloads, maxAnalyzedPathsForOneInput, fst, sepLabel, payloadSep, endByte);
lookupMap.put(entry.getValue(), holder);
}
return new LookupFactory() {
@Override
@ -242,17 +260,16 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
suggester = new XFuzzySuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(),
suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(), suggestionContext.isFuzzyUnicodeAware(),
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput);
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte);
} else {
suggester = new XAnalyzingSuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput);
analyzingSuggestHolder.preservePositionIncrements, analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte);
}
suggester.setPreservePositionIncrements(analyzingSuggestHolder.preservePositionIncrements);
return suggester;
}
@ -280,6 +297,11 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
return new CompletionStats(sizeInBytes, completionFields);
}
@Override
AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper) {
return lookupMap.get(mapper.names().indexName());
}
};
}
@ -291,9 +313,16 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
final boolean hasPayloads;
final int maxAnalyzedPathsForOneInput;
final FST<Pair<Long, BytesRef>> fst;
final int sepLabel;
final int payloadSep;
final int endByte;
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst) {
this(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads, maxAnalyzedPathsForOneInput, fst, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
}
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep, int endByte) {
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
@ -301,8 +330,10 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
this.hasPayloads = hasPayloads;
this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
this.fst = fst;
this.sepLabel = sepLabel;
this.payloadSep = payloadSep;
this.endByte = endByte;
}
}
@Override

View File

@ -362,5 +362,6 @@ public class Completion090PostingsFormat extends PostingsFormat {
public static abstract class LookupFactory {
public abstract Lookup getLookup(FieldMapper<?> mapper, CompletionSuggestionContext suggestionContext);
public abstract CompletionStats stats(String ... fields);
abstract AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper);
}
}

View File

@ -68,6 +68,8 @@ public class CompletionSuggestParser implements SuggestContextParser {
suggestion.setFuzzyMinLength(parser.intValue());
} else if ("prefix_length".equals(fuzzyConfigName) || "prefixLength".equals(fuzzyConfigName)) {
suggestion.setFuzzyPrefixLength(parser.intValue());
} else if ("unicode_aware".equals(fuzzyConfigName) || "unicodeAware".equals(fuzzyConfigName)) {
suggestion.setFuzzyUnicodeAware(parser.booleanValue());
}
}
}

View File

@ -34,6 +34,7 @@ public class CompletionSuggestionContext extends SuggestionSearchContext.Suggest
private int fuzzyMinLength = XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
private int fuzzyPrefixLength = XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
private boolean fuzzy = false;
private boolean fuzzyUnicodeAware = XFuzzySuggester.DEFAULT_UNICODE_AWARE;
public CompletionSuggestionContext(Suggester suggester) {
super(suggester);
@ -86,4 +87,12 @@ public class CompletionSuggestionContext extends SuggestionSearchContext.Suggest
public boolean isFuzzy() {
return fuzzy;
}
public void setFuzzyUnicodeAware(boolean fuzzyUnicodeAware) {
this.fuzzyUnicodeAware = fuzzyUnicodeAware;
}
public boolean isFuzzyUnicodeAware() {
return fuzzyUnicodeAware;
}
}

View File

@ -38,6 +38,7 @@ public class CompletionSuggestionFuzzyBuilder extends SuggestBuilder.SuggestionB
private boolean fuzzyTranspositions = XFuzzySuggester.DEFAULT_TRANSPOSITIONS;
private int fuzzyMinLength = XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
private int fuzzyPrefixLength = XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
private boolean unicodeAware = XFuzzySuggester.DEFAULT_UNICODE_AWARE;
public int getFuzzyEditDistance() {
return fuzzyEditDistance;
@ -75,6 +76,15 @@ public class CompletionSuggestionFuzzyBuilder extends SuggestBuilder.SuggestionB
return this;
}
public boolean isUnicodeAware() {
return unicodeAware;
}
public CompletionSuggestionFuzzyBuilder setUnicodeAware(boolean unicodeAware) {
this.unicodeAware = unicodeAware;
return this;
}
@Override
protected XContentBuilder innerToXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
builder.startObject("fuzzy");
@ -91,6 +101,9 @@ public class CompletionSuggestionFuzzyBuilder extends SuggestBuilder.SuggestionB
if (fuzzyPrefixLength != XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX) {
builder.field("prefix_length", fuzzyPrefixLength);
}
if (unicodeAware != XFuzzySuggester.DEFAULT_UNICODE_AWARE) {
builder.field("unicode_aware", unicodeAware);
}
builder.endObject();
return builder;

View File

@ -566,6 +566,36 @@ public class CompletionSuggestSearchTests extends ElasticsearchIntegrationTest {
assertSuggestions(suggestResponse, false, "foo", "Nirvana");
}
/**
 * Indexes a single input made of non-ASCII characters and runs the same fuzzy
 * suggestion three times: unicode aware, not unicode aware, and not unicode
 * aware with an edit distance of 2.
 */
@Test
public void testThatFuzzySuggesterIsUnicodeAware() throws Exception {
    createIndexAndMapping("simple", "simple", true, true, true);

    client().prepareIndex(INDEX, TYPE, "1")
            .setSource(jsonBuilder()
                    .startObject()
                    .startObject(FIELD)
                    .startArray("input").value("ööööö").endArray()
                    .endObject()
                    .endObject())
            .get();
    refresh();

    // the query text differs from the indexed input in a way that needs unicode awareness
    CompletionSuggestionFuzzyBuilder fuzzyBuilder = new CompletionSuggestionFuzzyBuilder("foo")
            .field(FIELD)
            .text("öööи")
            .size(10)
            .setUnicodeAware(true);
    SuggestResponse unicodeAwareResponse = client().prepareSuggest(INDEX)
            .addSuggestion(fuzzyBuilder)
            .execute()
            .actionGet();
    assertSuggestions(unicodeAwareResponse, false, "foo", "ööööö");

    // with unicode awareness switched off nothing is expected to match
    fuzzyBuilder.setUnicodeAware(false);
    SuggestResponse byteBasedResponse = client().prepareSuggest(INDEX)
            .addSuggestion(fuzzyBuilder)
            .execute()
            .actionGet();
    assertSuggestions(byteBasedResponse, false, "foo");

    // a larger edit distance makes the suggestion match again without unicode
    // awareness, as only a single character differs
    fuzzyBuilder.setFuzzyEditDistance(2);
    SuggestResponse widerDistanceResponse = client().prepareSuggest(INDEX)
            .addSuggestion(fuzzyBuilder)
            .execute()
            .actionGet();
    assertSuggestions(widerDistanceResponse, false, "foo", "ööööö");
}
@Test
public void testThatStatsAreWorking() throws Exception {
String otherField = "testOtherField";
@ -650,8 +680,11 @@ public class CompletionSuggestSearchTests extends ElasticsearchIntegrationTest {
refresh();
assertSuggestions("f", "Feed the trolls", "Feed trolls");
assertSuggestions("fe", "Feed the trolls", "Feed trolls");
assertSuggestions("fee", "Feed the trolls", "Feed trolls");
assertSuggestions("feed", "Feed the trolls", "Feed trolls");
assertSuggestions("feed t", "Feed the trolls", "Feed trolls");
assertSuggestions("feed th", "Feed the trolls");
assertSuggestions("feed the", "Feed the trolls");
// stop word complete, gets ignored on query time, makes it "feed" only
assertSuggestions("feed the ", "Feed the trolls", "Feed trolls");

View File

@ -0,0 +1,330 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest.completion;
import com.carrotsearch.hppc.ObjectLongOpenHashMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.*;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
import org.apache.lucene.search.suggest.analyzing.XFuzzySuggester;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.CompletionLookupProvider;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.LookupFactory;
import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder;
import java.io.IOException;
import java.util.*;
/**
 * This is an older implementation of the AnalyzingCompletionLookupProvider class.
 * We use this to test for backwards compatibility in our tests, namely
 * CompletionPostingsFormatTest.
 * This ensures upgrades between versions work smoothly.
 * <p>
 * The data is written with codec name {@link #CODEC_NAME} and version
 * {@link #CODEC_VERSION}. The separator label, end byte and payload separator are
 * fixed to the constants declared in this class; they are not written to the output.
 */
public class AnalyzingCompletionLookupProviderV1 extends CompletionLookupProvider {
// for serialization: bit flags combined into the single "options" vint stored per field
public static final int SERIALIZE_PRESERVE_SEPERATORS = 1;
public static final int SERIALIZE_HAS_PAYLOADS = 2;
public static final int SERIALIZE_PRESERVE_POSITION_INCREMENTS = 4;
// values assigned to the instance fields of the same meaning in the constructor
private static final int MAX_SURFACE_FORMS_PER_ANALYZED_FORM = 256;
private static final int MAX_GRAPH_EXPANSIONS = -1;
// header name and version written by consumer() and checked by load()
public static final String CODEC_NAME = "analyzing";
public static final int CODEC_VERSION = 1;
private boolean preserveSep;
private boolean preservePositionIncrements;
private int maxSurfaceFormsPerAnalyzedForm;
private int maxGraphExpansions;
private boolean hasPayloads;
// suggester instance created without analyzers; only used by toFiniteStrings()
private final XAnalyzingSuggester prototype;
// important, these are the settings from the old xanalyzingsuggester
public static final int SEP_LABEL = 0xFF;
public static final int END_BYTE = 0x0;
public static final int PAYLOAD_SEP = '\u001f';
/**
 * Creates the provider and its prototype suggester.
 *
 * @param preserveSep                whether the PRESERVE_SEP option is set on the prototype and serialized
 * @param exactFirst                 currently ignored, its use is commented out below
 * @param preservePositionIncrements passed to the prototype suggester and serialized
 * @param hasPayloads                passed to the FST builder and serialized
 */
public AnalyzingCompletionLookupProviderV1(boolean preserveSep, boolean exactFirst, boolean preservePositionIncrements, boolean hasPayloads) {
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.hasPayloads = hasPayloads;
this.maxSurfaceFormsPerAnalyzedForm = MAX_SURFACE_FORMS_PER_ANALYZED_FORM;
this.maxGraphExpansions = MAX_GRAPH_EXPANSIONS;
int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
// needs to be fixed in the suggester first before it can be supported
//options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements,
null, false, 1, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
}
/**
 * Returns the fixed name "analyzing" (the same text as {@link #CODEC_NAME}).
 */
@Override
public String getName() {
return "analyzing";
}
/**
 * Writes the codec header to the given output and returns a consumer that stores,
 * per field, an FST followed by some meta information. On close the consumer
 * appends the table of field offsets and the position of that table.
 *
 * @param output the output everything is written to; it is closed by the returned consumer's close()
 * @return the consumer writing the version 1 format
 * @throws IOException if writing the header fails
 */
@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION);
return new FieldsConsumer() {
// file pointer at which each field's FST starts, filled in finish()
private Map<FieldInfo, Long> fieldOffsets = new HashMap<FieldInfo, Long>();
@Override
public void close() throws IOException {
try { /*
* write the offsets per field such that we know where
* we need to load the FSTs from
*/
long pointer = output.getFilePointer();
output.writeVInt(fieldOffsets.size());
for (Map.Entry<FieldInfo, Long> entry : fieldOffsets.entrySet()) {
output.writeString(entry.getKey().name);
output.writeVLong(entry.getValue());
}
// the start of the offset table goes last, as a fixed-size long, so load() can find it from the end of the file
output.writeLong(pointer);
output.flush();
} finally {
IOUtils.close(output);
}
}
@Override
public TermsConsumer addField(final FieldInfo field) throws IOException {
return new TermsConsumer() {
final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads, PAYLOAD_SEP);
final CompletionPostingsConsumer postingsConsumer = new CompletionPostingsConsumer(AnalyzingCompletionLookupProviderV1.this, builder);
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
builder.startTerm(text);
// the same postings consumer instance is handed out for every term of this field
return postingsConsumer;
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
builder.finishTerm(stats.docFreq); // use doc freq as a fallback
}
@Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
/*
* Here we are done processing the field and we can
* build the FST and write it to disk.
*/
FST<Pair<Long, BytesRef>> build = builder.build();
assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
/*
* it's possible that the FST is null if we have 2 segments that get merged
* and all docs that have a value in this field are deleted. This will cause
* a consumer to be created but it doesn't consume any values causing the FSTBuilder
* to return null.
*/
if (build != null) {
// remember where this field's FST starts before writing it
fieldOffsets.put(field, output.getFilePointer());
build.save(output);
/* write some more meta-info */
// load() reads these values back in exactly this order
output.writeVInt(postingsConsumer.getMaxAnalyzedPathsForOneInput());
output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
output.writeInt(maxGraphExpansions); // can be negative
int options = 0;
options |= preserveSep ? SERIALIZE_PRESERVE_SEPERATORS : 0;
options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
output.writeVInt(options);
}
}
};
}
};
}
/**
 * Postings consumer that parses each position's payload and adds the resulting
 * surface form, payload and weight to the FST builder.
 */
private static final class CompletionPostingsConsumer extends PostingsConsumer {
// reused for every parsed payload
private final SuggestPayload spare = new SuggestPayload();
private AnalyzingCompletionLookupProviderV1 analyzingSuggestLookupProvider;
private XAnalyzingSuggester.XBuilder builder;
private int maxAnalyzedPathsForOneInput = 0;
public CompletionPostingsConsumer(AnalyzingCompletionLookupProviderV1 analyzingSuggestLookupProvider, XAnalyzingSuggester.XBuilder builder) {
this.analyzingSuggestLookupProvider = analyzingSuggestLookupProvider;
this.builder = builder;
}
@Override
public void startDoc(int docID, int freq) throws IOException {
// intentionally empty
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
analyzingSuggestLookupProvider.parsePayload(payload, spare);
builder.addSurface(spare.surfaceForm, spare.payload, spare.weight);
// multi fields have the same surface form so we sum up here
// NOTE(review): the statement below keeps the maximum of (position + 1), it does not add anything up
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
}
@Override
public void finishDoc() throws IOException {
// intentionally empty
}
/**
 * Returns the largest (position + 1) seen by addPosition so far.
 */
public int getMaxAnalyzedPathsForOneInput() {
return maxAnalyzedPathsForOneInput;
}
}
;
/**
 * Reads data written by {@link #consumer(IndexOutput)}: checks the header, reads the
 * table of field offsets found via the trailing pointer, then reads every field's FST
 * and meta information into memory.
 *
 * @param input the input to read from
 * @return a factory creating suggesters and stats from the loaded FSTs
 * @throws IOException if reading fails or the header does not match
 */
@Override
public LookupFactory load(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION, CODEC_VERSION);
final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<String, AnalyzingSuggestHolder>();
// the last 8 bytes hold the position of the offset table, written with writeLong in the consumer's close()
input.seek(input.length() - 8);
long metaPointer = input.readLong();
input.seek(metaPointer);
int numFields = input.readVInt();
// keyed by offset, so the loop below visits the fields in ascending file position
Map<Long, String> meta = new TreeMap<Long, String>();
for (int i = 0; i < numFields; i++) {
String name = input.readString();
long offset = input.readVLong();
meta.put(offset, name);
}
for (Map.Entry<Long, String> entry : meta.entrySet()) {
input.seek(entry.getKey());
FST<Pair<Long, BytesRef>> fst = new FST<Pair<Long, BytesRef>>(input, new PairOutputs<Long, BytesRef>(
PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
// same order as the writes in the terms consumer's finish()
int maxAnalyzedPathsForOneInput = input.readVInt();
int maxSurfaceFormsPerAnalyzedForm = input.readVInt();
int maxGraphExpansions = input.readInt();
int options = input.readVInt();
boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPERATORS) != 0;
boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0;
boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0;
lookupMap.put(entry.getValue(), new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
hasPayloads, maxAnalyzedPathsForOneInput, fst));
}
return new LookupFactory() {
/**
 * Creates a fuzzy or plain suggester on top of the FST loaded for the mapper's
 * index name, or returns null when nothing was loaded for that field.
 */
@Override
public Lookup getLookup(FieldMapper<?> mapper, CompletionSuggestionContext suggestionContext) {
AnalyzingSuggestHolder analyzingSuggestHolder = lookupMap.get(mapper.names().indexName());
if (analyzingSuggestHolder == null) {
return null;
}
int flags = analyzingSuggestHolder.preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
XAnalyzingSuggester suggester;
if (suggestionContext.isFuzzy()) {
// NOTE(review): the literal false after getFuzzyMinLength() is presumably the unicode aware flag - confirm against the XFuzzySuggester constructor
suggester = new XFuzzySuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(), false,
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
} else {
suggester = new XAnalyzingSuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
analyzingSuggestHolder.preservePositionIncrements,
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
}
return suggester;
}
/**
 * Sums the size of all loaded FSTs and, when field patterns are given,
 * additionally collects the FST size per matching pattern.
 */
@Override
public CompletionStats stats(String... fields) {
long sizeInBytes = 0;
// stays null when no field patterns were passed in
ObjectLongOpenHashMap<String> completionFields = null;
if (fields != null && fields.length > 0) {
completionFields = new ObjectLongOpenHashMap<String>(fields.length);
}
for (Map.Entry<String, AnalyzingSuggestHolder> entry : lookupMap.entrySet()) {
sizeInBytes += entry.getValue().fst.sizeInBytes();
if (fields == null || fields.length == 0) {
continue;
}
for (String field : fields) {
// support for getting fields by regex as in fielddata
if (Regex.simpleMatch(field, entry.getKey())) {
long fstSize = entry.getValue().fst.sizeInBytes();
// sizes are accumulated under the requested pattern, not under the matched field name
completionFields.addTo(field, fstSize);
}
}
}
return new CompletionStats(sizeInBytes, completionFields);
}
/**
 * Returns the holder loaded for the mapper's index name, or null if there is none.
 */
@Override
AnalyzingSuggestHolder getAnalyzingSuggestHolder(FieldMapper<?> mapper) {
return lookupMap.get(mapper.names().indexName());
}
};
}
/*
// might be readded when we change the current impl, right now not needed
static class AnalyzingSuggestHolder {
final boolean preserveSep;
final boolean preservePositionIncrements;
final int maxSurfaceFormsPerAnalyzedForm;
final int maxGraphExpansions;
final boolean hasPayloads;
final int maxAnalyzedPathsForOneInput;
final FST<Pair<Long, BytesRef>> fst;
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst) {
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
this.maxGraphExpansions = maxGraphExpansions;
this.hasPayloads = hasPayloads;
this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
this.fst = fst;
}
}
*/
/**
 * Delegates to the prototype suggester, using the prototype's own token stream to automaton converter.
 */
@Override
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
return prototype.toFiniteStrings(prototype.getTokenStreamToAutomaton(), stream);
}
}

View File

@ -17,8 +17,9 @@
* under the License.
*/
package org.elasticsearch.search.suggest;
package org.elasticsearch.search.suggest.completion;
import com.google.common.collect.Lists;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.*;
import org.apache.lucene.document.Document;
@ -42,10 +43,8 @@ import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvider;
import org.elasticsearch.index.mapper.FieldMapper.Names;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.LookupFactory;
import org.elasticsearch.search.suggest.completion.CompletionSuggestionContext;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;
@ -56,28 +55,33 @@ import java.util.HashMap;
import java.util.List;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
@Test
public void testCompletionPostingsFormat() throws IOException {
AnalyzingCompletionLookupProvider provider = new AnalyzingCompletionLookupProvider(true, false, true, true);
AnalyzingCompletionLookupProviderV1 providerV1 = new AnalyzingCompletionLookupProviderV1(true, false, true, true);
AnalyzingCompletionLookupProvider currentProvider = new AnalyzingCompletionLookupProvider(true, false, true, true);
List<Completion090PostingsFormat.CompletionLookupProvider> providers = Lists.newArrayList(providerV1, currentProvider);
Completion090PostingsFormat.CompletionLookupProvider randomProvider = providers.get(getRandom().nextInt(providers.size()));
RAMDirectory dir = new RAMDirectory();
IndexOutput output = dir.createOutput("foo.txt", IOContext.DEFAULT);
FieldsConsumer consumer = provider.consumer(output);
FieldsConsumer consumer = randomProvider.consumer(output);
FieldInfo fieldInfo = new FieldInfo("foo", true, 1, false, true, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
DocValuesType.SORTED, DocValuesType.BINARY, new HashMap<String, String>());
TermsConsumer addField = consumer.addField(fieldInfo);
PostingsConsumer postingsConsumer = addField.startTerm(new BytesRef("foofightersgenerator"));
postingsConsumer.startDoc(0, 1);
postingsConsumer.addPosition(256 - 2, provider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
postingsConsumer.addPosition(256 - 2, randomProvider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
1);
postingsConsumer.finishDoc();
addField.finishTerm(new BytesRef("foofightersgenerator"), new TermStats(1, 1));
addField.startTerm(new BytesRef("generator"));
postingsConsumer.startDoc(0, 1);
postingsConsumer.addPosition(256 - 1, provider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
postingsConsumer.addPosition(256 - 1, randomProvider.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
1);
postingsConsumer.finishDoc();
addField.finishTerm(new BytesRef("generator"), new TermStats(1, 1));
@ -86,7 +90,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
output.close();
IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
LookupFactory load = provider.load(input);
LookupFactory load = currentProvider.load(input);
PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null));
@ -96,6 +100,46 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
dir.close();
}
/**
 * Writes a field with the version 1 provider, loads it with the current provider and
 * checks that the loaded holder reports the version 1 separator label, payload
 * separator and end byte.
 * <p>
 * The input and the directory are closed in finally blocks so that they are released
 * even when loading or one of the assertions fails.
 */
@Test
public void testProviderBackwardCompatibilityForVersion1() throws IOException {
    AnalyzingCompletionLookupProviderV1 providerV1 = new AnalyzingCompletionLookupProviderV1(true, false, true, true);
    AnalyzingCompletionLookupProvider currentProvider = new AnalyzingCompletionLookupProvider(true, false, true, true);

    RAMDirectory dir = new RAMDirectory();
    try {
        // write two terms pointing to the same surface form using the old provider
        IndexOutput output = dir.createOutput("foo.txt", IOContext.DEFAULT);
        FieldsConsumer consumer = providerV1.consumer(output);
        FieldInfo fieldInfo = new FieldInfo("foo", true, 1, false, true, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                DocValuesType.SORTED, DocValuesType.BINARY, new HashMap<String, String>());
        TermsConsumer addField = consumer.addField(fieldInfo);

        PostingsConsumer postingsConsumer = addField.startTerm(new BytesRef("foofightersgenerator"));
        postingsConsumer.startDoc(0, 1);
        postingsConsumer.addPosition(256 - 2, providerV1.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
                1);
        postingsConsumer.finishDoc();
        addField.finishTerm(new BytesRef("foofightersgenerator"), new TermStats(1, 1));

        addField.startTerm(new BytesRef("generator"));
        postingsConsumer.startDoc(0, 1);
        postingsConsumer.addPosition(256 - 1, providerV1.buildPayload(new BytesRef("Generator - Foo Fighters"), 9, new BytesRef("id:10")), 0,
                1);
        postingsConsumer.finishDoc();
        addField.finishTerm(new BytesRef("generator"), new TermStats(1, 1));
        addField.finish(1, 1, 1);
        consumer.close();
        // kept from the original test; the version 1 consumer's close() also closes the output
        output.close();

        // read the data back with the current provider
        IndexInput input = dir.openInput("foo.txt", IOContext.DEFAULT);
        try {
            LookupFactory load = currentProvider.load(input);
            PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
            NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
            AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder analyzingSuggestHolder = load.getAnalyzingSuggestHolder(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE));
            assertThat(analyzingSuggestHolder.sepLabel, is(AnalyzingCompletionLookupProviderV1.SEP_LABEL));
            assertThat(analyzingSuggestHolder.payloadSep, is(AnalyzingCompletionLookupProviderV1.PAYLOAD_SEP));
            assertThat(analyzingSuggestHolder.endByte, is(AnalyzingCompletionLookupProviderV1.END_BYTE));
        } finally {
            // the original test never closed the input
            input.close();
        }
    } finally {
        dir.close();
    }
}
@Test
public void testDuellCompletions() throws IOException, NoSuchFieldException, SecurityException, IllegalArgumentException,
IllegalAccessException {
@ -105,8 +149,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;
XAnalyzingSuggester reference = new XAnalyzingSuggester(new StandardAnalyzer(TEST_VERSION_CURRENT), new StandardAnalyzer(
TEST_VERSION_CURRENT), options, 256, -1, null, false, 1);
reference.setPreservePositionIncrements(preservePositionIncrements);
TEST_VERSION_CURRENT), options, 256, -1, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
LineFileDocs docs = new LineFileDocs(getRandom());
int num = atLeast(150);
final String[] titles = new String[num];