mirror of https://github.com/apache/lucene.git
make sure max length term works
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1081829 13f79535-47bb-0310-9956-ffa450edef68
parent ac3ef52bd0
commit 987a9ef84c
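The new test below exercises the indexer's term-length limit by filling a char[] of length DocumentsWriter.MAX_TERM_LENGTH_UTF8 with 'x': since 'x' encodes to a single UTF-8 byte, the character count equals the encoded byte count the limit is measured in. A minimal standalone sketch of that idea follows; TERM_LIMIT and the class name are hypothetical placeholders, not part of Lucene.

import java.io.UnsupportedEncodingException;
import java.util.Arrays;

// Standalone sketch, not part of this commit: shows why filling a term with 'x'
// makes its char length equal its UTF-8 byte length. TERM_LIMIT is a placeholder
// standing in for DocumentsWriter.MAX_TERM_LENGTH_UTF8.
public class Utf8TermLengthSketch {
  private static final int TERM_LIMIT = 1024; // placeholder value only

  public static void main(String[] args) throws UnsupportedEncodingException {
    char[] chars = new char[TERM_LIMIT];
    Arrays.fill(chars, 'x');          // 'x' encodes to exactly one UTF-8 byte
    String term = new String(chars);
    int utf8Bytes = term.getBytes("UTF-8").length;
    System.out.println(term.length() + " chars -> " + utf8Bytes + " UTF-8 bytes");
    // A multi-byte character would break the equality:
    String nonAscii = "é";            // 2 bytes in UTF-8
    System.out.println(nonAscii.length() + " char -> " + nonAscii.getBytes("UTF-8").length + " UTF-8 bytes");
  }
}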
@@ -21,53 +21,57 @@ import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.NoLockFactory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SingleInstanceLockFactory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;

public class TestIndexWriter extends LuceneTestCase {
@@ -2912,4 +2916,123 @@ public class TestIndexWriter extends LuceneTestCase {
    w.close();
    dir.close();
  }

  private static class StringSplitAnalyzer extends Analyzer {
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      return new StringSplitTokenizer(reader);
    }
  }

  private static class StringSplitTokenizer extends Tokenizer {
    private final String[] tokens;
    private int upto = 0;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public StringSplitTokenizer(Reader r) {
      try {
        // Read the whole input and split on single spaces only, so an
        // arbitrarily long token reaches the indexer without being clipped.
        final StringBuilder b = new StringBuilder();
        final char[] buffer = new char[1024];
        int n;
        while ((n = r.read(buffer)) != -1) {
          b.append(buffer, 0, n);
        }
        tokens = b.toString().split(" ");
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    }

    @Override
    public final boolean incrementToken() throws IOException {
      clearAttributes();
      if (upto < tokens.length) {
        termAtt.setEmpty();
        termAtt.append(tokens[upto]);
        upto++;
        return true;
      } else {
        return false;
      }
    }
  }

  /**
   * Make sure we skip wicked long terms.
   */
  public void testWickedLongTerm() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random, dir, new StringSplitAnalyzer());

    char[] chars = new char[DocumentsWriter.MAX_TERM_LENGTH_UTF8];
    Arrays.fill(chars, 'x');
    Document doc = new Document();
    final String bigTerm = new String(chars);
    final BytesRef bigTermBytesRef = new BytesRef(bigTerm);

    // These contents produce a too-long term (the "x" prefix pushes the third
    // token one byte past the limit):
    String contents = "abc xyz x" + bigTerm + " another term";
    doc.add(new Field("content", contents, Field.Store.NO, Field.Index.ANALYZED));
    w.addDocument(doc);

    // Make sure we can add another normal document
    doc = new Document();
    doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.ANALYZED));
    w.addDocument(doc);

    IndexReader reader = w.getReader();
    w.close();

    // Make sure all terms < max size were indexed
    assertEquals(2, reader.docFreq(new Term("content", "abc")));
    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
    assertEquals(1, reader.docFreq(new Term("content", "term")));
    assertEquals(1, reader.docFreq(new Term("content", "another")));

    // Make sure position is still incremented when
    // massive term is skipped:
    DocsAndPositionsEnum tps = MultiFields.getTermPositionsEnum(reader, null, "content", new BytesRef("another"));
    assertEquals(0, tps.nextDoc());
    assertEquals(1, tps.freq());
    assertEquals(3, tps.nextPosition());

    // Make sure the doc that has the massive term is in
    // the index:
    assertEquals("document with wicked long term is not in the index!", 2, reader.numDocs());

    reader.close();
    dir.close();
    dir = newDirectory();

    // Make sure we can add a document with exactly the
    // maximum length term, and search on that term:
    doc = new Document();
    Field contentField = new Field("content", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
    doc.add(contentField);

    w = new RandomIndexWriter(random, dir);

    contentField.setValue("other");
    w.addDocument(doc);

    contentField.setValue("term");
    w.addDocument(doc);

    contentField.setValue(bigTerm);
    w.addDocument(doc);

    contentField.setValue("zzz");
    w.addDocument(doc);

    reader = w.getReader();
    w.close();
    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));

    FieldCache.DocTermsIndex dti = FieldCache.DEFAULT.getTermsIndex(reader, "content", random.nextBoolean());
    assertEquals(5, dti.numOrd()); // +1 for null ord
    assertEquals(4, dti.size());
    // bigTerm ("xxx...") sorts third among the four indexed values:
    assertEquals(bigTermBytesRef, dti.lookup(3, new BytesRef()));
    reader.close();
    dir.close();
  }
}
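The test verifies the exactly-max-length term through docFreq and the FieldCache ord lookups; it does not run an actual query on it. A rough sketch of such an extra check, assuming the trunk-era IndexSearcher API of the time (constructor taking an IndexReader, explicit close()) and reusing the reader and bigTerm names from the test above:

  // Sketch only, not part of this commit: a direct search on the maximum-length
  // term in the same "content" field. IndexSearcher, TermQuery and TopDocs are
  // already imported by the test class; Term and IndexReader are in its package.
  private void assertBigTermSearchable(IndexReader reader, String bigTerm) throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(new TermQuery(new Term("content", bigTerm)), 1);
    assertEquals("exactly-max-length term should be searchable", 1, hits.totalHits);
    searcher.close();
  }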