LUCENE-4880: Fix MemoryIndex handling of empty string

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1466019 13f79535-47bb-0310-9956-ffa450edef68
2013-04-09 13:28:11 +00:00 · 2013-04-09 13:28:11 +00:00 · 37a6474a8f
parent 75c03eb548
commit 37a6474a8f
3 changed files with 49 additions and 2 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -235,6 +235,9 @@ Bug Fixes
 * LUCENE-4918: Highlighter closes the given IndexReader if QueryScorer
  is used with an external IndexReader. (Simon Willnauer, Sirvan Yahyaei)
 * LUCENE-4880: Fix MemoryIndex to consume empty terms from the tokenstream consistent
  with IndexWriter. Previously it discarded them.  (Timothy Allison via Robert Muir)
 Documentation
 * LUCENE-4841: Added example SimpleSortedSetFacetsExample to show how
--- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@ -429,7 +429,6 @@ public class MemoryIndex {
      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
        if (ref.length == 0) continue; // nothing to do
 //        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
--- a/lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
+++ b/lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
@ -21,15 +21,22 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.HashSet;
 import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@ -54,6 +61,7 @@ import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.RegexpQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
 import org.apache.lucene.search.spans.SpanOrQuery;
@ -249,13 +257,41 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
   * Return a random analyzer (Simple, Stop, Standard) to analyze the terms.
   */
  private Analyzer randomAnalyzer() {
-    switch(random().nextInt(3)) {
+    switch(random().nextInt(4)) {
      case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
      case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
      case 2: return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader);
          return new TokenStreamComponents(tokenizer, new CrazyTokenFilter(tokenizer));
        }
      };
      default: return new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    }
  }
  // a tokenfilter that makes all terms starting with 't' empty strings
  static final class CrazyTokenFilter extends TokenFilter {
    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    CrazyTokenFilter(TokenStream input) {
      super(input);
    }
    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        if (termAtt.length() > 0 && termAtt.buffer()[0] == 't') {
          termAtt.setLength(0);
        }
        return true;
      } else {
        return false;
      }
    }
  };
  /**
   * Some terms to be indexed, in addition to random words. 
   * These terms are commonly used in the queries. 
@ -425,4 +461,13 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
    }
    lineFileDocs.close();
  }
  // LUCENE-4880
  public void testEmptyString() throws IOException {
    MemoryIndex memory = new MemoryIndex();
    memory.addField("foo", new CannedTokenStream(new Token("", 0, 5)));
    IndexSearcher searcher = memory.createSearcher();
    TopDocs docs = searcher.search(new TermQuery(new Term("foo", "")), 10);
    assertEquals(1, docs.totalHits);
  }
 }