mirror of https://github.com/apache/lucene.git

commit 5e2f59dc75 (parent 295e46da1f)

AnalyzingSuggester doesn't have to escape the 0xff byte unless preserveSep is true

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1397135 13f79535-47bb-0310-9956-ffa450edef68
AnalyzingSuggester.java

@@ -273,7 +273,7 @@ public class AnalyzingSuggester extends Lookup {
     }
   }
 
-  /** Just escapes the bytes we steal (0xff, 0x0). */
+  /** Just escapes the 0xff byte (which we steal for SEP). */
   private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
 
     final BytesRef spare = new BytesRef();
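For context on what that escaping entails: when preserveSep is true, the suggester steals the 0xff byte as its token separator (SEP), so a literal 0xff inside a token's bytes must be escaped. A minimal sketch of the kind of changeToken override EscapingTokenStreamToAutomaton uses, reconstructed from this diff's context lines (the exact body is not shown in this diff, so treat the details as illustrative):

    // Illustrative sketch only -- the real override lives in
    // EscapingTokenStreamToAutomaton; exact details may differ.
    @Override
    protected BytesRef changeToken(BytesRef in) {
      int upto = 0;
      for (int i = 0; i < in.length; i++) {
        final byte b = in.bytes[in.offset + i];
        if (b == (byte) 0xff) {
          // Escape a literal 0xff so it cannot collide with SEP:
          spare.grow(upto + 2);
          spare.bytes[upto++] = (byte) 0xff;
          spare.bytes[upto++] = b;
        } else {
          spare.grow(upto + 1);
          spare.bytes[upto++] = b;
        }
      }
      spare.offset = 0;
      spare.length = upto;
      return spare;
    }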
@@ -301,6 +301,16 @@ public class AnalyzingSuggester extends Lookup {
       return spare;
     }
   }
 
+  private TokenStreamToAutomaton getTokenStreamToAutomaton() {
+    if (preserveSep) {
+      return new EscapingTokenStreamToAutomaton();
+    } else {
+      // When we're not preserving sep, we don't steal 0xff
+      // byte, so we don't need to do any escaping:
+      return new TokenStreamToAutomaton();
+    }
+  }
+
   @Override
   public void build(TermFreqIterator iterator) throws IOException {
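The dispatch above means callers only pay for escaping when separators are preserved. A hedged usage sketch, using the five-argument constructor that appears in the updated test below (the variable names and the bare `0` options value here are hypothetical, not taken from this commit):

    // Hypothetical usage: the options flag decides whether 0xff is stolen
    // for SEP (and therefore escaped).
    AnalyzingSuggester withSep = new AnalyzingSuggester(
        analyzer, analyzer, AnalyzingSuggester.PRESERVE_SEP, 256, -1);  // escapes 0xff
    AnalyzingSuggester noSep = new AnalyzingSuggester(
        analyzer, analyzer, 0, 256, -1);  // no escaping needed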
@@ -313,8 +323,7 @@ public class AnalyzingSuggester extends Lookup {
     Sort.ByteSequencesReader reader = null;
     BytesRef scratch = new BytesRef();
 
-    TokenStreamToAutomaton ts2a = new EscapingTokenStreamToAutomaton();
-
+    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
     // analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short)
     boolean success = false;
     byte buffer[] = new byte[8];
@@ -489,7 +498,7 @@ public class AnalyzingSuggester extends Lookup {
       // TODO: is there a Reader from a CharSequence?
       // Turn tokenstream into automaton:
       TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
-      Automaton automaton = (new EscapingTokenStreamToAutomaton()).toAutomaton(ts);
+      Automaton automaton = getTokenStreamToAutomaton().toAutomaton(ts);
       ts.end();
       ts.close();
 
AnalyzingSuggesterTest.java

@@ -706,63 +706,67 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
   }
 
   public void testStolenBytes() throws Exception {
 
-    final Analyzer analyzer = new Analyzer() {
-        @Override
-        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
-
-          // TokenStream stream = new SynonymFilter(tokenizer, map, true);
-          // return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
-          return new TokenStreamComponents(tokenizer) {
-            int tokenStreamCounter = 0;
-            final TokenStream[] tokenStreams = new TokenStream[] {
-              new CannedBinaryTokenStream(new BinaryToken[] {
-                  token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
-                }),
-              new CannedTokenStream(new Token[] {
-                  token("a",1,1),
-                  token("a",1,1)
-                }),
-              new CannedTokenStream(new Token[] {
-                  token("a",1,1),
-                  token("a",1,1)
-                }),
-              new CannedBinaryTokenStream(new BinaryToken[] {
-                  token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
-                })
-            };
-
-            @Override
-            public TokenStream getTokenStream() {
-              TokenStream result = tokenStreams[tokenStreamCounter];
-              tokenStreamCounter++;
-              return result;
-            }
-
-            @Override
-            protected void setReader(final Reader reader) throws IOException {
-            }
-          };
-        }
-      };
-
-    TermFreq keys[] = new TermFreq[] {
-      new TermFreq("a a", 50),
-      new TermFreq("a b", 50),
-    };
-
-    AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer);
-    suggester.build(new TermFreqArrayIterator(keys));
-    List<LookupResult> results = suggester.lookup("a a", false, 5);
-    assertEquals(1, results.size());
-    assertEquals("a b", results.get(0).key);
-    assertEquals(50, results.get(0).value);
-
-    results = suggester.lookup("a a", false, 5);
-    assertEquals(1, results.size());
-    assertEquals("a a", results.get(0).key);
-    assertEquals(50, results.get(0).value);
+    // First time w/ preserveSep, second time without:
+    for(int i=0;i<2;i++) {
+
+      final Analyzer analyzer = new Analyzer() {
+          @Override
+          protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+            Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+
+            // TokenStream stream = new SynonymFilter(tokenizer, map, true);
+            // return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
+            return new TokenStreamComponents(tokenizer) {
+              int tokenStreamCounter = 0;
+              final TokenStream[] tokenStreams = new TokenStream[] {
+                new CannedBinaryTokenStream(new BinaryToken[] {
+                    token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
+                  }),
+                new CannedTokenStream(new Token[] {
+                    token("a",1,1),
+                    token("a",1,1)
+                  }),
+                new CannedTokenStream(new Token[] {
+                    token("a",1,1),
+                    token("a",1,1)
+                  }),
+                new CannedBinaryTokenStream(new BinaryToken[] {
+                    token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
+                  })
+              };
+
+              @Override
+              public TokenStream getTokenStream() {
+                TokenStream result = tokenStreams[tokenStreamCounter];
+                tokenStreamCounter++;
+                return result;
+              }
+
+              @Override
+              protected void setReader(final Reader reader) throws IOException {
+              }
+            };
+          }
+        };
+
+      TermFreq keys[] = new TermFreq[] {
+        new TermFreq("a a", 50),
+        new TermFreq("a b", 50),
+      };
+
+      AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, analyzer, AnalyzingSuggester.EXACT_FIRST | (i==0 ? AnalyzingSuggester.PRESERVE_SEP : 0), 256, -1);
+      suggester.build(new TermFreqArrayIterator(keys));
+      List<LookupResult> results = suggester.lookup("a a", false, 5);
+      assertEquals(1, results.size());
+      assertEquals("a b", results.get(0).key);
+      assertEquals(50, results.get(0).value);
+
+      results = suggester.lookup("a a", false, 5);
+      assertEquals(1, results.size());
+      assertEquals("a a", results.get(0).key);
+      assertEquals(50, results.get(0).value);
+    }
   }
 
   public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
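The test's overloaded token(...) helpers are defined elsewhere in AnalyzingSuggesterTest and are not part of this diff; a plausible sketch of their shape, for readers following along (assumed, not verbatim):

    // Assumed helper shapes (not part of this diff): wrap a surface term
    // with explicit position increment/length, or wrap raw bytes for the
    // CannedBinaryTokenStream.
    private static Token token(String term, int posInc, int posLength) {
      final Token t = new Token(term, 0, 0);
      t.setPositionIncrement(posInc);
      t.setPositionLength(posLength);
      return t;
    }

    private static BinaryToken token(BytesRef termBytes) {
      return new BinaryToken(termBytes);
    }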