LUCENE-4203: add IndexWriter.tryDeleteDocument

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1368745 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-08-02 22:46:27 +00:00
parent 4e099f3571
commit 21e7728e57
6 changed files with 143 additions and 7 deletions

View File

@@ -43,6 +43,11 @@ New features
implementations to optimize the enum implementation. (Robert Muir,
Mike McCandless)
* LUCENE-4203: Add IndexWriter.tryDeleteDocument(AtomicReader reader,
int docID), to attempt deletion by docID as long as the provided
reader is an NRT reader, and the segment has not yet been merged
away (Mike McCandless).
API Changes
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.

View File

@@ -1,8 +1,5 @@
package org.apache.lucene.index;
import java.util.Collections;
import java.util.List;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -20,6 +17,9 @@ import java.util.List;
* limitations under the License.
*/
import java.util.Collections;
import java.util.List;
/**
* {@link IndexReaderContext} for {@link AtomicReader} instances
* @lucene.experimental
@@ -51,8 +51,9 @@ public final class AtomicReaderContext extends IndexReaderContext {
@Override
public List<AtomicReaderContext> leaves() {
if (!isTopLevel)
if (!isTopLevel) {
throw new UnsupportedOperationException("This is not a top-level context.");
}
assert leaves != null;
return leaves;
}

View File

@@ -1,7 +1,5 @@
package org.apache.lucene.index;
import java.util.List;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -19,6 +17,8 @@ import java.util.List;
* limitations under the License.
*/
import java.util.List;
/**
* A struct like class that represents a hierarchical relationship between
* {@link IndexReader} instances.

View File

@@ -1241,6 +1241,78 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}
}
/** Expert: attempts to delete by document ID, as long as
 *  the provided reader is a near-real-time reader (from {@link
 *  DirectoryReader#open(IndexWriter,boolean)}).  If the
 *  provided reader is an NRT reader obtained from this
 *  writer, and its segment has not been merged away, then
 *  the delete succeeds and this method returns true; else, it
 *  returns false; the caller must then separately delete by
 *  Term or Query.
 *
 *  <b>NOTE</b>: this method can only delete documents
 *  visible to the currently open NRT reader.  If you need
 *  to delete documents indexed after opening the NRT
 *  reader you must use the other deleteDocument methods
 *  (e.g., {@link #deleteDocuments(Term)}). */
public synchronized boolean tryDeleteDocument(IndexReader readerIn, int docID) throws IOException {

  final AtomicReader reader;
  if (readerIn instanceof AtomicReader) {
    // Reader is already atomic: use the incoming docID:
    reader = (AtomicReader) readerIn;
  } else {
    // Composite reader: lookup sub-reader and re-base docID:
    List<AtomicReaderContext> leaves = readerIn.getTopReaderContext().leaves();
    int subIndex = ReaderUtil.subIndex(docID, leaves);
    reader = leaves.get(subIndex).reader();
    docID -= leaves.get(subIndex).docBase;
    assert docID >= 0;
    assert docID < reader.maxDoc();
  }

  // Only SegmentReaders carry the SegmentInfoPerCommit we need to
  // locate the live segment inside this writer:
  if (!(reader instanceof SegmentReader)) {
    throw new IllegalArgumentException("the reader must be a SegmentReader or composite reader containing only SegmentReaders");
  }

  final SegmentInfoPerCommit info = ((SegmentReader) reader).getSegmentInfo();

  // TODO: this is a slow linear search, but, number of
  // segments should be contained unless something is
  // seriously wrong w/ the index, so it should be a minor
  // cost:
  if (segmentInfos.indexOf(info) != -1) {
    // Segment is still live in this writer; try to find its pooled
    // reader so we can mutate its live docs in place:
    ReadersAndLiveDocs rld = readerPool.get(info, false);
    if (rld != null) {
      // NOTE(review): presumably syncing on bufferedDeletesStream keeps
      // this liveDocs mutation atomic w.r.t. applying buffered
      // deletes -- confirm against bufferedDeletesStream's locking:
      synchronized(bufferedDeletesStream) {
        rld.initWritableLiveDocs();
        if (rld.delete(docID)) {
          final int fullDelCount = rld.info.getDelCount() + rld.getPendingDeleteCount();
          if (fullDelCount == rld.info.info.getDocCount()) {
            // Every doc in the segment is now deleted; drop the
            // whole segment.  If a merge has already registered for this
            // segment, we leave it in the readerPool; the
            // merge will skip merging it and will then drop
            // it once it's done:
            if (!mergingSegments.contains(rld.info)) {
              segmentInfos.remove(rld.info);
              readerPool.drop(rld.info);
              checkpoint();
            }
          }
        }
        // NOTE(review): we return true even when rld.delete(docID)
        // returned false -- presumably that means the doc was already
        // deleted, so the delete is still "applied"; confirm against
        // ReadersAndLiveDocs.delete:
        //System.out.println("  yes " + info.info.name + " " + docID);
        return true;
      }
    } else {
      // Segment is live but has no pooled reader; caller must fall
      // back to deleting by Term/Query:
      //System.out.println("  no rld " + info.info.name + " " + docID);
    }
  } else {
    // Segment was merged away (or otherwise dropped) since the NRT
    // reader was opened; cannot delete by docID:
    //System.out.println("  no seg " + info.info.name + " " + docID);
  }
  return false;
}
/**
* Deletes the document(s) containing any of the
* terms. All given deletes are applied and flushed atomically

View File

@@ -27,6 +27,7 @@ import java.util.concurrent.locks.ReentrantLock;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.SegmentInfoPerCommit;
import org.apache.lucene.index.IndexReader; // javadocs
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexableField;
@@ -254,6 +255,14 @@ public class NRTManager extends ReferenceManager<IndexSearcher> {
/** Returns the current indexing generation, then advances it by one. */
long getAndIncrementGeneration() {
  final long current = indexingGen.getAndIncrement();
  return current;
}
/** Attempts the delete via {@link IndexWriter#tryDeleteDocument};
 *  returns the current indexing generation when the writer accepted
 *  the delete, or -1 when it did not (the caller must then delete
 *  by Term or Query instead). */
public long tryDeleteDocument(IndexReader reader, int docID) throws IOException {
  final boolean deleted = writer.tryDeleteDocument(reader, docID);
  return deleted ? indexingGen.get() : -1;
}
}
/**

View File

@@ -23,6 +23,9 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.document.*;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
import org.junit.Test;
@@ -48,10 +51,13 @@ public class TestRollingUpdates extends LuceneTestCase {
final int SIZE = atLeast(20);
int id = 0;
IndexReader r = null;
IndexSearcher s = null;
final int numUpdates = (int) (SIZE * (2+(TEST_NIGHTLY ? 200*random().nextDouble() : 5*random().nextDouble())));
if (VERBOSE) {
System.out.println("TEST: numUpdates=" + numUpdates);
}
int updateCount = 0;
// TODO: sometimes update ids not in order...
for(int docIter=0;docIter<numUpdates;docIter++) {
final Document doc = docs.nextDoc();
final String myID = ""+id;
@@ -60,16 +66,59 @@
} else {
id++;
}
if (VERBOSE) {
System.out.println(" docIter=" + docIter + " id=" + id);
}
((Field) doc.getField("docid")).setStringValue(myID);
w.updateDocument(new Term("docid", myID), doc);
Term idTerm = new Term("docid", myID);
final boolean doUpdate;
if (s != null && updateCount < SIZE) {
TopDocs hits = s.search(new TermQuery(idTerm), 1);
assertEquals(1, hits.totalHits);
doUpdate = !w.tryDeleteDocument(r, hits.scoreDocs[0].doc);
if (VERBOSE) {
if (doUpdate) {
System.out.println(" tryDeleteDocument failed");
} else {
System.out.println(" tryDeleteDocument succeeded");
}
}
} else {
doUpdate = true;
if (VERBOSE) {
System.out.println(" no searcher: doUpdate=true");
}
}
updateCount++;
if (doUpdate) {
w.updateDocument(idTerm, doc);
} else {
w.addDocument(doc);
}
if (docIter >= SIZE && random().nextInt(50) == 17) {
if (r != null) {
r.close();
}
final boolean applyDeletions = random().nextBoolean();
if (VERBOSE) {
System.out.println("TEST: reopen applyDeletions=" + applyDeletions);
}
r = w.getReader(applyDeletions);
if (applyDeletions) {
s = new IndexSearcher(r);
} else {
s = null;
}
assertTrue("applyDeletions=" + applyDeletions + " r.numDocs()=" + r.numDocs() + " vs SIZE=" + SIZE, !applyDeletions || r.numDocs() == SIZE);
updateCount = 0;
}
}