Made hole character in XAnalyzingSuggester part of Postingsformat

* Hole charactor now can change with new releases
* Fixed bug where the SEP_LABEL constant was used instead of the sepLabel instance variable
* Replaced if- with switch-statement
This commit is contained in:
Alexander Reelsen 2013-11-27 13:21:53 +01:00
parent 1e85e4dd26
commit 11de330246
5 changed files with 42 additions and 32 deletions

View File

@ -92,7 +92,7 @@ import java.util.*;
* @lucene.experimental
*/
public class XAnalyzingSuggester extends Lookup {
/**
* FST<Weight,Surface>:
* input is the analyzed form, with a null byte between terms
@ -124,14 +124,14 @@ public class XAnalyzingSuggester extends Lookup {
private final boolean preserveSep;
/** Include this flag in the options parameter to {@link
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)} to always
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to always
* return the exact match first, regardless of score. This
* has no performance impact but could result in
* low-quality suggestions. */
public static final int EXACT_FIRST = 1;
/** Include this flag in the options parameter to {@link
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)} to preserve
* #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to preserve
* token separators when matching. */
public static final int PRESERVE_SEP = 2;
@ -163,6 +163,7 @@ public class XAnalyzingSuggester extends Lookup {
private final int sepLabel;
private final int payloadSep;
private final int endByte;
private final int holeCharacter;
public static final int PAYLOAD_SEP = '\u001F';
public static final int HOLE_CHARACTER = '\u001E';
@ -171,21 +172,21 @@ public class XAnalyzingSuggester extends Lookup {
private boolean preservePositionIncrements;
/**
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
* PRESERVE_SEP, 256, -1)}
*/
public XAnalyzingSuggester(Analyzer analyzer) {
this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
this(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
}
/**
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int)
* Calls {@link #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)
* AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |
* PRESERVE_SEP, 256, -1)}
*/
public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
}
/**
@ -206,7 +207,7 @@ public class XAnalyzingSuggester extends Lookup {
*/
public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
int sepLabel, int payloadSep, int endByte) {
int sepLabel, int payloadSep, int endByte, int holeCharacter) {
// SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
this.indexAnalyzer = indexAnalyzer;
this.queryAnalyzer = queryAnalyzer;
@ -236,6 +237,7 @@ public class XAnalyzingSuggester extends Lookup {
this.sepLabel = sepLabel;
this.payloadSep = payloadSep;
this.endByte = endByte;
this.holeCharacter = holeCharacter;
}
/** Returns byte size of the underlying FST. */
@ -682,10 +684,10 @@ public class XAnalyzingSuggester extends Lookup {
//System.out.println("lookup key=" + key + " num=" + num);
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == HOLE_CHARACTER) {
if (key.charAt(i) == holeCharacter) {
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == SEP_LABEL) {
if (key.charAt(i) == sepLabel) {
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}

View File

@ -122,7 +122,7 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
*/
public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
}
@ -154,8 +154,8 @@ public final class XFuzzySuggester extends XAnalyzingSuggester {
public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware,
FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
int sepLabel, int payloadSep, int endByte) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, true, fst, hasPayloads, maxAnalyzedPathsForOneInput, sepLabel, payloadSep, endByte);
int sepLabel, int payloadSep, int endByte, int holeCharacter) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, true, fst, hasPayloads, maxAnalyzedPathsForOneInput, sepLabel, payloadSep, endByte, holeCharacter);
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
}

View File

@ -74,7 +74,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
// needs to fixed in the suggester first before it can be supported
//options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
}
@Override
@ -231,19 +231,23 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
// first version did not include these three fields, so fall back to old default (before the analyzingsuggester
// was updated in Lucene, so we cannot use the suggester defaults)
int sepLabel, payloadSep, endByte;
if (version == CODEC_VERSION_START) {
sepLabel = 0xFF;
payloadSep = '\u001f';
endByte = 0x0;
} else {
sepLabel = input.readVInt();
endByte = input.readVInt();
payloadSep = input.readVInt();
int sepLabel, payloadSep, endByte, holeCharacter;
switch (version) {
case CODEC_VERSION_START:
sepLabel = 0xFF;
payloadSep = '\u001f';
endByte = 0x0;
holeCharacter = '\u001E';
break;
default:
sepLabel = input.readVInt();
endByte = input.readVInt();
payloadSep = input.readVInt();
holeCharacter = input.readVInt();
}
AnalyzingSuggestHolder holder = new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
hasPayloads, maxAnalyzedPathsForOneInput, fst, sepLabel, payloadSep, endByte);
hasPayloads, maxAnalyzedPathsForOneInput, fst, sepLabel, payloadSep, endByte, holeCharacter);
lookupMap.put(entry.getValue(), holder);
}
return new LookupFactory() {
@ -262,13 +266,15 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(), suggestionContext.isFuzzyUnicodeAware(),
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte);
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte,
analyzingSuggestHolder.holeCharacter);
} else {
suggester = new XAnalyzingSuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
analyzingSuggestHolder.preservePositionIncrements, analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte);
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte,
analyzingSuggestHolder.holeCharacter);
}
return suggester;
}
@ -316,13 +322,14 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
final int sepLabel;
final int payloadSep;
final int endByte;
final int holeCharacter;
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst) {
this(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads, maxAnalyzedPathsForOneInput, fst, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
this(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads, maxAnalyzedPathsForOneInput, fst, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
}
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep, int endByte) {
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep, int endByte, int holeCharacter) {
this.preserveSep = preserveSep;
this.preservePositionIncrements = preservePositionIncrements;
this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
@ -333,6 +340,7 @@ public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider
this.sepLabel = sepLabel;
this.payloadSep = payloadSep;
this.endByte = endByte;
this.holeCharacter = holeCharacter;
}
}

View File

@ -85,7 +85,7 @@ public class AnalyzingCompletionLookupProviderV1 extends CompletionLookupProvide
// needs to fixed in the suggester first before it can be supported
//options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
prototype = new XAnalyzingSuggester(null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements,
null, false, 1, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
null, false, 1, SEP_LABEL, PAYLOAD_SEP, END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
}
@Override
@ -255,14 +255,14 @@ public class AnalyzingCompletionLookupProviderV1 extends CompletionLookupProvide
suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(), false,
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, SEP_LABEL, PAYLOAD_SEP, END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
} else {
suggester = new XAnalyzingSuggester(mapper.indexAnalyzer(), mapper.searchAnalyzer(), flags,
analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
analyzingSuggestHolder.preservePositionIncrements,
analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, SEP_LABEL, PAYLOAD_SEP, END_BYTE);
analyzingSuggestHolder.maxAnalyzedPathsForOneInput, SEP_LABEL, PAYLOAD_SEP, END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
}
return suggester;
}

View File

@ -123,7 +123,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;
XAnalyzingSuggester reference = new XAnalyzingSuggester(new StandardAnalyzer(TEST_VERSION_CURRENT), new StandardAnalyzer(
TEST_VERSION_CURRENT), options, 256, -1, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE);
TEST_VERSION_CURRENT), options, 256, -1, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
LineFileDocs docs = new LineFileDocs(getRandom());
int num = atLeast(150);
final String[] titles = new String[num];