mirror of https://github.com/apache/lucene.git
LUCENE-4880: Fix MemoryIndex handling of empty string
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1466019 13f79535-47bb-0310-9956-ffa450edef68
parent 75c03eb548
commit 37a6474a8f
@@ -235,6 +235,9 @@ Bug Fixes
 * LUCENE-4918: Highlighter closes the given IndexReader if QueryScorer
   is used with an external IndexReader. (Simon Willnauer, Sirvan Yahyaei)
 
+* LUCENE-4880: Fix MemoryIndex to consume empty terms from the tokenstream consistent
+  with IndexWriter. Previously it discarded them. (Timothy Allison via Robert Muir)
+
 Documentation
 
 * LUCENE-4841: Added example SimpleSortedSetFacetsExample to show how
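The "consistent with IndexWriter" wording above means that IndexWriter already indexes a zero-length term rather than silently dropping it; this commit makes MemoryIndex behave the same way. The standalone sketch below (not part of this commit) illustrates that parity. The class name, the analyzer choice, and the Version constant are illustrative assumptions, and CannedTokenStream/Token come from the Lucene test framework:

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class EmptyTermParitySketch {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir,
        new IndexWriterConfig(Version.LUCENE_43, new WhitespaceAnalyzer(Version.LUCENE_43)));
    Document doc = new Document();
    // one token whose term text is the empty string (offsets 0-5),
    // the same token the new MemoryIndex test uses
    doc.add(new TextField("foo", new CannedTokenStream(new Token("", 0, 5))));
    writer.addDocument(doc);
    writer.close();

    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
    // IndexWriter kept the empty term, so this prints 1
    System.out.println(searcher.search(new TermQuery(new Term("foo", "")), 10).totalHits);
    dir.close();
  }
}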
@@ -429,7 +429,6 @@ public class MemoryIndex {
       while (stream.incrementToken()) {
         termAtt.fillBytesRef();
-        if (ref.length == 0) continue; // nothing to do
         // if (DEBUG) System.err.println("token='" + term + "'");
         numTokens++;
         final int posIncr = posIncrAttribute.getPositionIncrement();
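The deleted guard above is the whole fix: MemoryIndex used to skip any token whose term bytes were empty, so empty terms never reached its postings. With the guard gone, the loop counts the token (numTokens++) and applies its position increment like any other term, which is the behavior the CHANGES entry describes as consistent with IndexWriter.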
@@ -21,15 +21,22 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -54,6 +61,7 @@ import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
 import org.apache.lucene.search.spans.SpanOrQuery;
@@ -249,13 +257,41 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
    * Return a random analyzer (Simple, Stop, Standard) to analyze the terms.
    */
   private Analyzer randomAnalyzer() {
-    switch(random().nextInt(3)) {
+    switch(random().nextInt(4)) {
       case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
       case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+      case 2: return new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader);
+          return new TokenStreamComponents(tokenizer, new CrazyTokenFilter(tokenizer));
+        }
+      };
       default: return new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
     }
   }
 
+  // a tokenfilter that makes all terms starting with 't' empty strings
+  static final class CrazyTokenFilter extends TokenFilter {
+    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+    CrazyTokenFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        if (termAtt.length() > 0 && termAtt.buffer()[0] == 't') {
+          termAtt.setLength(0);
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+  };
+
   /**
    * Some terms to be indexed, in addition to random words.
    * These terms are commonly used in the queries.
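As a quick illustration of what the new CrazyTokenFilter emits (a sketch, not part of the commit; the sample text and expected terms are assumptions), it can be driven with the test framework's assertTokenStreamContents from inside a BaseTokenStreamTestCase subclass:

Tokenizer tokenizer = new MockTokenizer(new StringReader("the quick test"));
TokenStream stream = new CrazyTokenFilter(tokenizer);
// MockTokenizer splits on whitespace by default; terms starting with 't'
// come back as zero-length terms instead of being dropped
assertTokenStreamContents(stream, new String[] { "", "quick", "" });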
@@ -425,4 +461,13 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
     }
     lineFileDocs.close();
   }
+
+  // LUCENE-4880
+  public void testEmptyString() throws IOException {
+    MemoryIndex memory = new MemoryIndex();
+    memory.addField("foo", new CannedTokenStream(new Token("", 0, 5)));
+    IndexSearcher searcher = memory.createSearcher();
+    TopDocs docs = searcher.search(new TermQuery(new Term("foo", "")), 10);
+    assertEquals(1, docs.totalHits);
+  }
 }
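The new test hands MemoryIndex a single token whose term text is empty but whose offsets still span 0-5, then checks that a TermQuery for the empty string matches the document. Before this change the addField loop discarded that token, so the same search found nothing.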