LUCENE-4880: Fix MemoryIndex handling of empty string

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1466019 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-04-09 13:28:11 +00:00
parent 75c03eb548
commit 37a6474a8f
3 changed files with 49 additions and 2 deletions

View File

@ -235,6 +235,9 @@ Bug Fixes
* LUCENE-4918: Highlighter closes the given IndexReader if QueryScorer * LUCENE-4918: Highlighter closes the given IndexReader if QueryScorer
is used with an external IndexReader. (Simon Willnauer, Sirvan Yahyaei) is used with an external IndexReader. (Simon Willnauer, Sirvan Yahyaei)
* LUCENE-4880: Fix MemoryIndex to consume empty terms from the tokenstream consistent
with IndexWriter. Previously it discarded them. (Timothy Allison via Robert Muir)
Documentation Documentation
* LUCENE-4841: Added example SimpleSortedSetFacetsExample to show how * LUCENE-4841: Added example SimpleSortedSetFacetsExample to show how

View File

@ -429,7 +429,6 @@ public class MemoryIndex {
while (stream.incrementToken()) { while (stream.incrementToken()) {
termAtt.fillBytesRef(); termAtt.fillBytesRef();
if (ref.length == 0) continue; // nothing to do
// if (DEBUG) System.err.println("token='" + term + "'"); // if (DEBUG) System.err.println("token='" + term + "'");
numTokens++; numTokens++;
final int posIncr = posIncrAttribute.getPositionIncrement(); final int posIncr = posIncrAttribute.getPositionIncrement();

View File

@ -21,15 +21,22 @@ import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader; import java.io.StringReader;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
@ -54,6 +61,7 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.RegexpQuery; import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanOrQuery;
@ -249,13 +257,41 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
* Return a random analyzer (Simple, Stop, Standard) to analyze the terms. * Return a random analyzer (Simple, Stop, Standard) to analyze the terms.
*/ */
private Analyzer randomAnalyzer() { private Analyzer randomAnalyzer() {
switch(random().nextInt(3)) { switch(random().nextInt(4)) {
case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
case 2: return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader);
return new TokenStreamComponents(tokenizer, new CrazyTokenFilter(tokenizer));
}
};
default: return new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); default: return new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
} }
} }
// a tokenfilter that makes all terms starting with 't' empty strings
static final class CrazyTokenFilter extends TokenFilter {
final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
CrazyTokenFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (termAtt.length() > 0 && termAtt.buffer()[0] == 't') {
termAtt.setLength(0);
}
return true;
} else {
return false;
}
}
};
/** /**
* Some terms to be indexed, in addition to random words. * Some terms to be indexed, in addition to random words.
* These terms are commonly used in the queries. * These terms are commonly used in the queries.
@ -425,4 +461,13 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
} }
lineFileDocs.close(); lineFileDocs.close();
} }
// LUCENE-4880
public void testEmptyString() throws IOException {
MemoryIndex memory = new MemoryIndex();
memory.addField("foo", new CannedTokenStream(new Token("", 0, 5)));
IndexSearcher searcher = memory.createSearcher();
TopDocs docs = searcher.search(new TermQuery(new Term("foo", "")), 10);
assertEquals(1, docs.totalHits);
}
} }