LUCENE-1072: make sure on hitting a too-long term that IndexWriter is still usable
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@600465 13f79535-47bb-0310-9956-ffa450edef68
commit ddf136d928 (parent 11a7bf835b)
@@ -354,7 +354,7 @@ final class DocumentsWriter {
         state.tvfLocal.reset();
         state.fdtLocal.reset();
       }

       docStoreSegment = null;
       files = null;

     } finally {
@@ -518,6 +518,7 @@ final class DocumentsWriter {
     int numAllFieldData;
     FieldData[] fieldDataHash;          // Hash FieldData instances by field name
     int fieldDataHashMask;
+    int maxTermHit;                     // Set to > 0 if this doc has a too-large term

     boolean doFlushAfter;

@@ -608,6 +609,7 @@ final class DocumentsWriter {
       numStoredFields = 0;
       numFieldData = 0;
       numVectorFields = 0;
+      maxTermHit = 0;

       List docFields = doc.getFields();
       final int numDocFields = docFields.size();
@@ -1483,17 +1485,23 @@ final class DocumentsWriter {
         getPostings(postingsFreeList);
       }

-      // Pull next free Posting from free list
-      p = postingsFreeList[--postingsFreeCount];
-
       final int textLen1 = 1+tokenTextLen;
       if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) {
-        if (textLen1 > CHAR_BLOCK_SIZE)
-          throw new IllegalArgumentException("term length " + tokenTextLen + " exceeds max term length " + (CHAR_BLOCK_SIZE-1));
+        if (textLen1 > CHAR_BLOCK_SIZE) {
+          maxTermHit = tokenTextLen;
+          // Just skip this term; we will throw an
+          // exception after processing all accepted
+          // terms in the doc
+          return;
+        }
         charPool.nextBuffer();
       }
       final char[] text = charPool.buffer;
       final int textUpto = charPool.byteUpto;

+      // Pull next free Posting from free list
+      p = postingsFreeList[--postingsFreeCount];
+
       p.textStart = textUpto + charPool.byteOffset;
       charPool.byteUpto += textLen1;
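The hunk above is the core of the fix: instead of throwing IllegalArgumentException from deep inside the per-thread indexing state (which could leave DocumentsWriter, and therefore IndexWriter, unusable), the too-long term is skipped, its length is recorded in maxTermHit, and the exception is raised only after the whole document has been processed. A minimal sketch of that record-now-throw-later pattern follows; it collapses DocumentsWriter's bookkeeping and IndexWriter's later check into one class for brevity, and TermSink / MAX_TERM_LENGTH are illustrative names, not Lucene API.

import java.util.List;

class TermSink {
    static final int MAX_TERM_LENGTH = 16383;  // mirrors CHAR_BLOCK_SIZE - 1 in the real code
    private int maxTermHit;                    // > 0 once this document hit a too-long term

    // Called once per token: an over-long term is skipped rather than thrown on,
    // so the in-memory indexing state stays consistent.
    void addTerm(String term) {
        if (term.length() > MAX_TERM_LENGTH) {
            maxTermHit = Math.max(maxTermHit, term.length());
            return;                            // skip this term, keep processing the doc
        }
        // ... index the term normally ...
    }

    // Only after the whole document has been processed is the violation
    // surfaced, when it is safe to throw.
    void processDocument(List<String> tokens) {
        for (int i = 0; i < tokens.size(); i++)
            addTerm(tokens.get(i));
        if (maxTermHit > 0) {
            int hit = maxTermHit;
            maxTermHit = 0;                    // reset so the next document starts clean
            throw new IllegalArgumentException("term of length " + hit
                + " exceeds max term length " + MAX_TERM_LENGTH + "; it was skipped");
        }
    }
}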
@@ -2181,26 +2189,28 @@ final class DocumentsWriter {

   /** Returns true if the caller (IndexWriter) should now
    * flush. */
-  boolean addDocument(Document doc, Analyzer analyzer)
+  int addDocument(Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     return updateDocument(doc, analyzer, null);
   }

-  boolean updateDocument(Term t, Document doc, Analyzer analyzer)
+  int updateDocument(Term t, Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     return updateDocument(doc, analyzer, t);
   }

-  boolean updateDocument(Document doc, Analyzer analyzer, Term delTerm)
+  int updateDocument(Document doc, Analyzer analyzer, Term delTerm)
     throws CorruptIndexException, IOException {

     // This call is synchronized but fast
     final ThreadState state = getThreadState(doc, delTerm);
     boolean success = false;
+    int maxTermHit;
     try {
       // This call is not synchronized and does all the work
       state.processDocument(analyzer);
       // This call synchronized but fast
+      maxTermHit = state.maxTermHit;
       finishDocument(state);
       success = true;
     } finally {
@@ -2209,7 +2219,11 @@ final class DocumentsWriter {
         abort();
       }
     }
-    return state.doFlushAfter || timeToFlushDeletes();
+
+    int status = maxTermHit<<1;
+    if (state.doFlushAfter || timeToFlushDeletes())
+      status += 1;
+    return status;
   }

   synchronized int getNumBufferedDeleteTerms() {
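Because addDocument/updateDocument used to return only a boolean ("should the caller flush now?"), the additional fact that a too-long term was skipped is packed into the new int return value: bit 0 carries the flush flag, and the remaining bits carry the skipped term's length (0 if none). A hedged sketch of that encoding; the helper names are illustrative, not part of the patch.

final class DocStatus {
    // bit 0: caller should flush; bits 1..31: length of a skipped too-long term (0 = none)
    static int encode(int maxTermHit, boolean doFlush) {
        return (maxTermHit << 1) | (doFlush ? 1 : 0);
    }

    static boolean needsFlush(int status) {
        return (status & 1) != 0;              // same test IndexWriter performs below
    }

    static int skippedTermLength(int status) {
        return status >> 1;                    // > 0 means checkMaxTermLength will throw
    }

    public static void main(String[] args) {
        int status = encode(16384, true);
        System.out.println(needsFlush(status));         // true
        System.out.println(skippedTermLength(status));  // 16384
    }
}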
@@ -1426,10 +1426,10 @@ public class IndexWriter {
    */
   public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
     ensureOpen();
-    boolean doFlush = false;
+    int status = 0;
     boolean success = false;
     try {
-      doFlush = docWriter.addDocument(doc, analyzer);
+      status = docWriter.addDocument(doc, analyzer);
       success = true;
     } finally {
       if (!success) {
@@ -1446,8 +1446,9 @@ public class IndexWriter {
         }
       }
     }
-    if (doFlush)
+    if ((status & 1) != 0)
       flush(true, false);
+    checkMaxTermLength(status);
   }

   /**
@@ -1511,10 +1512,10 @@ public class IndexWriter {
   public void updateDocument(Term term, Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     ensureOpen();
-    boolean doFlush = false;
+    int status = 0;
     boolean success = false;
     try {
-      doFlush = docWriter.updateDocument(term, doc, analyzer);
+      status = docWriter.updateDocument(term, doc, analyzer);
       success = true;
     } finally {
       if (!success) {
@@ -1531,8 +1532,17 @@ public class IndexWriter {
         }
       }
     }
-    if (doFlush)
+    if ((status & 1) != 0)
       flush(true, false);
+    checkMaxTermLength(status);
   }

+  /** Throws IllegalArgumentException if the return status
+   * from DocumentsWriter.{add,update}Document indicates
+   * that a too-long term was encountered */
+  final private void checkMaxTermLength(int status) {
+    if (status > 1)
+      throw new IllegalArgumentException("at least one term (length " + (status>>1) + ") exceeds max term length " + (DocumentsWriter.CHAR_BLOCK_SIZE-1) + "; these terms were skipped");
+  }
+
   // for test purpose
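From the caller's point of view the contract is now: addDocument/updateDocument still throw IllegalArgumentException when a document contains a too-long term, but that term is simply skipped, the document's remaining terms are indexed, and the IndexWriter stays usable afterwards (the TestIndexWriter change below verifies exactly this). A hedged usage sketch against the Lucene API of this era; the field name, analyzer, and in-memory directory are illustrative choices, not prescribed by the patch.

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class AddDocsSkippingHugeTerms {
    public static void main(String[] args) throws IOException {
        IndexWriter writer = new IndexWriter(new RAMDirectory(), new StandardAnalyzer(), true);
        String[] bodies = { "normal text", hugeToken(), "more normal text" };
        for (int i = 0; i < bodies.length; i++) {
            Document doc = new Document();
            doc.add(new Field("content", bodies[i], Field.Store.NO, Field.Index.TOKENIZED));
            try {
                writer.addDocument(doc);
            } catch (IllegalArgumentException e) {
                // A too-long term was skipped; the rest of the document was still
                // indexed and the writer remains usable, so just keep going.
            }
        }
        writer.close();
    }

    private static String hugeToken() {
        char[] chars = new char[20000];     // longer than the 16383-char limit
        java.util.Arrays.fill(chars, 'x');
        return new String(chars);
    }
}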
@@ -28,8 +28,6 @@ import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.TermQuery;
@@ -221,12 +219,8 @@ public class TestIndexWriter extends LuceneTestCase
       methodName = "addIndexesNoOptimize(Directory[])";
     }

-    int cycleCount = 0;
-
     while(!done) {

-      cycleCount++;
-
       // Make a new dir that will enforce disk usage:
       MockRAMDirectory dir = new MockRAMDirectory(startDir);
       writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), false);
@@ -524,7 +518,7 @@ public class TestIndexWriter extends LuceneTestCase
     String[] startFiles = dir.list();
     SegmentInfos infos = new SegmentInfos();
     infos.read(dir);
-    IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
+    new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
     String[] endFiles = dir.list();

     Arrays.sort(startFiles);
@@ -543,17 +537,44 @@ public class TestIndexWriter extends LuceneTestCase
     RAMDirectory dir = new RAMDirectory();
     IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);

-    char[] chars = new char[16384];
+    char[] chars = new char[16383];
     Arrays.fill(chars, 'x');
     Document doc = new Document();
-    String contents = "a b c " + new String(chars);
+    final String bigTerm = new String(chars);
+
+    // Max length term is 16383, so this contents produces
+    // a too-long term:
+    String contents = "abc xyz x" + bigTerm;
     doc.add(new Field("content", contents, Field.Store.NO, Field.Index.TOKENIZED));
     try {
       writer.addDocument(doc);
       fail("did not hit expected exception");
     } catch (IllegalArgumentException e) {
     }
+
+    // Make sure we can add another normal document
+    doc = new Document();
+    doc.add(new Field("content", "abc bbb ccc", Field.Store.NO, Field.Index.TOKENIZED));
+    writer.addDocument(doc);
     writer.close();
+
+    IndexReader reader = IndexReader.open(dir);
+    // Make sure all terms < max size were indexed
+    assertEquals(2, reader.docFreq(new Term("content", "abc")));
+    assertEquals(1, reader.docFreq(new Term("content", "bbb")));
+    reader.close();
+
+    // Make sure we can add a document with exactly the
+    // maximum length term, and search on that term:
+    doc = new Document();
+    doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
+    writer = new IndexWriter(dir, new StandardAnalyzer());
+    writer.addDocument(doc);
+    writer.close();
+    reader = IndexReader.open(dir);
+    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
+    reader.close();
+
     dir.close();
   }
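Applications that prefer never to trigger the exception can screen their input before indexing. A small hedged sketch of such a client-side guard, assuming the 16383-character limit quoted in the test above and simple whitespace tokenization; the class and method names are hypothetical, not part of Lucene.

public final class TermLengthGuard {
    // Matches the "Max length term is 16383" limit exercised by the test above.
    public static final int MAX_TERM_LENGTH = 16383;

    // Truncate any whitespace-delimited token longer than the limit so that
    // IndexWriter.addDocument never has to skip a term.
    public static String truncateLongTokens(String text) {
        String[] tokens = text.split("\\s+");
        StringBuffer out = new StringBuffer(text.length());
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i];
            if (token.length() > MAX_TERM_LENGTH)
                token = token.substring(0, MAX_TERM_LENGTH);
            if (out.length() > 0)
                out.append(' ');
            out.append(token);
        }
        return out.toString();
    }
}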
@@ -1342,7 +1363,6 @@ public class TestIndexWriter extends LuceneTestCase
   public void testDiverseDocs() throws IOException {
     RAMDirectory dir = new RAMDirectory();
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
-    long t0 = System.currentTimeMillis();
     writer.setRAMBufferSizeMB(0.5);
     Random rand = new Random(31415);
     for(int i=0;i<3;i++) {
@@ -1381,7 +1401,6 @@ public class TestIndexWriter extends LuceneTestCase
     }
     writer.close();

-    long t1 = System.currentTimeMillis();
     IndexSearcher searcher = new IndexSearcher(dir);
     Hits hits = searcher.search(new TermQuery(new Term("field", "aaa")));
     assertEquals(300, hits.length());
@@ -1491,7 +1510,6 @@ public class TestIndexWriter extends LuceneTestCase
       addDoc(writer);
     }
     writer.close();
-    IndexReader reader = IndexReader.open(dir);
     Term searchTerm = new Term("content", "aaa");
     IndexSearcher searcher = new IndexSearcher(dir);
     Hits hits = searcher.search(new TermQuery(searchTerm));