LUCENE-4880: Fix MemoryIndex handling of empty string

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1466019 13f79535-47bb-0310-9956-ffa450edef68
2013-04-09 13:28:11 +00:00 · 2013-04-09 13:28:11 +00:00 · 37a6474a8f
parent 75c03eb548
commit 37a6474a8f
3 changed files with 49 additions and 2 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -235,6 +235,9 @@ Bug Fixes
 * LUCENE-4918: Highlighter closes the given IndexReader if QueryScorer
  is used with an external IndexReader. (Simon Willnauer, Sirvan Yahyaei)

+* LUCENE-4880: Fix MemoryIndex to consume empty terms from the tokenstream consistent
+  with IndexWriter. Previously it discarded them.  (Timothy Allison via Robert Muir)
+
 Documentation

 * LUCENE-4841: Added example SimpleSortedSetFacetsExample to show how
--- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@ -429,7 +429,6 @@ public class MemoryIndex {
      
      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
-        if (ref.length == 0) continue; // nothing to do
 //        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
--- a/lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
+++ b/lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
@ -21,15 +21,22 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.Reader;
 import java.io.StringReader;
 import java.util.HashSet;
 import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@ -54,6 +61,7 @@ import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
 import org.apache.lucene.search.spans.SpanOrQuery;
@ -249,13 +257,41 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
   * Return a random analyzer (Simple, Stop, Standard) to analyze the terms.
   */
  private Analyzer randomAnalyzer() {
-    switch(random().nextInt(3)) {
+    switch(random().nextInt(4)) {
      case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
      case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+      case 2: return new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader);
+          return new TokenStreamComponents(tokenizer, new CrazyTokenFilter(tokenizer));
+        }
+      };
      default: return new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    }
  }
  
+  // a tokenfilter that makes all terms starting with 't' empty strings
+  static final class CrazyTokenFilter extends TokenFilter {
+    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    
+    CrazyTokenFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        if (termAtt.length() > 0 && termAtt.buffer()[0] == 't') {
+          termAtt.setLength(0);
+        }
+        return true;
+      } else {
+        return false;
+      }
+    }
+  };
+  
  /**
   * Some terms to be indexed, in addition to random words. 
   * These terms are commonly used in the queries. 
@ -425,4 +461,13 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
    }
    lineFileDocs.close();
  }
+  
+  // LUCENE-4880
+  public void testEmptyString() throws IOException {
+    MemoryIndex memory = new MemoryIndex();
+    memory.addField("foo", new CannedTokenStream(new Token("", 0, 5)));
+    IndexSearcher searcher = memory.createSearcher();
+    TopDocs docs = searcher.search(new TermQuery(new Term("foo", "")), 10);
+    assertEquals(1, docs.totalHits);
+  }
 }