AnalyzingSuggester doesn't have to escape 0xff byte unless preserveSep is true

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1397135 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-10-11 16:07:27 +00:00
parent 295e46da1f
commit 5e2f59dc75
2 changed files with 66 additions and 53 deletions

View File

@ -273,7 +273,7 @@ public class AnalyzingSuggester extends Lookup {
} }
} }
/** Just escapes the bytes we steal (0xff, 0x0). */ /** Just escapes the 0xff byte (which we steal for SEP). */
private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
final BytesRef spare = new BytesRef(); final BytesRef spare = new BytesRef();
@ -301,6 +301,16 @@ public class AnalyzingSuggester extends Lookup {
return spare; return spare;
} }
} }
/**
 * Chooses the token-stream-to-automaton converter matching the
 * {@code preserveSep} setting.
 *
 * @return a plain {@link TokenStreamToAutomaton} when separators are not
 *         preserved, otherwise an {@link EscapingTokenStreamToAutomaton}
 */
private TokenStreamToAutomaton getTokenStreamToAutomaton() {
  // The 0xff byte is only stolen when sep is preserved, so without
  // preserveSep no escaping is required:
  if (!preserveSep) {
    return new TokenStreamToAutomaton();
  }
  return new EscapingTokenStreamToAutomaton();
}
@Override @Override
public void build(TermFreqIterator iterator) throws IOException { public void build(TermFreqIterator iterator) throws IOException {
@ -313,8 +323,7 @@ public class AnalyzingSuggester extends Lookup {
Sort.ByteSequencesReader reader = null; Sort.ByteSequencesReader reader = null;
BytesRef scratch = new BytesRef(); BytesRef scratch = new BytesRef();
TokenStreamToAutomaton ts2a = new EscapingTokenStreamToAutomaton(); TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
// analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short) // analyzed sequence + 0(byte) + weight(int) + surface + analyzedLength(short)
boolean success = false; boolean success = false;
byte buffer[] = new byte[8]; byte buffer[] = new byte[8];
@ -489,7 +498,7 @@ public class AnalyzingSuggester extends Lookup {
// TODO: is there a Reader from a CharSequence? // TODO: is there a Reader from a CharSequence?
// Turn tokenstream into automaton: // Turn tokenstream into automaton:
TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
Automaton automaton = (new EscapingTokenStreamToAutomaton()).toAutomaton(ts); Automaton automaton = getTokenStreamToAutomaton().toAutomaton(ts);
ts.end(); ts.end();
ts.close(); ts.close();

View File

@ -706,63 +706,67 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
} }
public void testStolenBytes() throws Exception { public void testStolenBytes() throws Exception {
final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
// TokenStream stream = new SynonymFilter(tokenizer, map, true);
// return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
return new TokenStreamComponents(tokenizer) {
int tokenStreamCounter = 0;
final TokenStream[] tokenStreams = new TokenStream[] {
new CannedBinaryTokenStream(new BinaryToken[] {
token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
}),
new CannedTokenStream(new Token[] {
token("a",1,1),
token("a",1,1)
}),
new CannedTokenStream(new Token[] {
token("a",1,1),
token("a",1,1)
}),
new CannedBinaryTokenStream(new BinaryToken[] {
token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
})
};
// First time w/ preserveSep, second time without:
for(int i=0;i<2;i++) {
final Analyzer analyzer = new Analyzer() {
@Override @Override
public TokenStream getTokenStream() { protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
TokenStream result = tokenStreams[tokenStreamCounter]; Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
tokenStreamCounter++;
return result; // TokenStream stream = new SynonymFilter(tokenizer, map, true);
} // return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
return new TokenStreamComponents(tokenizer) {
int tokenStreamCounter = 0;
final TokenStream[] tokenStreams = new TokenStream[] {
new CannedBinaryTokenStream(new BinaryToken[] {
token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
}),
new CannedTokenStream(new Token[] {
token("a",1,1),
token("a",1,1)
}),
new CannedTokenStream(new Token[] {
token("a",1,1),
token("a",1,1)
}),
new CannedBinaryTokenStream(new BinaryToken[] {
token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
})
};
@Override
public TokenStream getTokenStream() {
TokenStream result = tokenStreams[tokenStreamCounter];
tokenStreamCounter++;
return result;
}
@Override @Override
protected void setReader(final Reader reader) throws IOException { protected void setReader(final Reader reader) throws IOException {
}
};
} }
}; };
}
};
TermFreq keys[] = new TermFreq[] { TermFreq keys[] = new TermFreq[] {
new TermFreq("a a", 50), new TermFreq("a a", 50),
new TermFreq("a b", 50), new TermFreq("a b", 50),
}; };
AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer); AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, analyzer, AnalyzingSuggester.EXACT_FIRST | (i==0 ? AnalyzingSuggester.PRESERVE_SEP : 0), 256, -1);
suggester.build(new TermFreqArrayIterator(keys)); suggester.build(new TermFreqArrayIterator(keys));
List<LookupResult> results = suggester.lookup("a a", false, 5); List<LookupResult> results = suggester.lookup("a a", false, 5);
assertEquals(1, results.size()); assertEquals(1, results.size());
assertEquals("a b", results.get(0).key); assertEquals("a b", results.get(0).key);
assertEquals(50, results.get(0).value); assertEquals(50, results.get(0).value);
results = suggester.lookup("a a", false, 5); results = suggester.lookup("a a", false, 5);
assertEquals(1, results.size()); assertEquals(1, results.size());
assertEquals("a a", results.get(0).key); assertEquals("a a", results.get(0).key);
assertEquals(50, results.get(0).value); assertEquals(50, results.get(0).value);
}
} }
public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception { public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {