LUCENE-2919: add PKIndexSplitter, to split index into two by term

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1137064 13f79535-47bb-0310-9956-ffa450edef68
2011-06-17 22:52:55 +00:00 · 2011-06-17 22:52:55 +00:00 · bba6b01826
parent f32a6aa455
commit bba6b01826
3 changed files with 233 additions and 0 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -79,6 +79,10 @@ New Features
   facilitate doing grouping in a distributed environment (Uwe
   Schindler, Mike McCandless)

+ * LUCENE-2919: Added PKIndexSplitter, that splits an index according
+   to a middle term in a specified field.  (Jason Rutherglen via Mike
+   McCandless)
+
 API Changes

 * LUCENE-3141: add getter method to access fragInfos in FieldFragList.
--- a/lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java
+++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java
@ -0,0 +1,136 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.OpenBitSet;
+import org.apache.lucene.util.Version;
+
+/**
+ * Split an index based on a given primary key term 
+ * and a 'middle' term.  If the middle term is present, it's
+ * sent to dir2.
+ */
+public class PKIndexSplitter {
+  private Term midTerm;
+  Directory input;
+  Directory dir1;
+  Directory dir2; 
+  
+  public PKIndexSplitter(Term midTerm, Directory input, 
+      Directory dir1, Directory dir2) {
+    this.midTerm = midTerm;
+    this.input = input;
+    this.dir1 = dir1;
+    this.dir2 = dir2;
+  }
+  
+  public void split() throws IOException {
+    IndexReader reader = IndexReader.open(input);
+    OpenBitSet lowDels = setDeletes(reader, null, midTerm.bytes());
+    OpenBitSet hiDels = setDeletes(reader, midTerm.bytes(), null);
+    
+    createIndex(dir1, reader, lowDels);
+    createIndex(dir2, reader, hiDels);
+    reader.close();
+  }
+  
+  private void createIndex(Directory target, IndexReader reader, OpenBitSet bv) throws IOException {
+    IndexWriter w = new IndexWriter(target, new IndexWriterConfig(
+        Version.LUCENE_CURRENT,
+        new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
+        .setOpenMode(OpenMode.CREATE));
+    w.addIndexes(new DeletesIndexReader(reader, bv));
+    w.close();
+  }
+  
+  private OpenBitSet setDeletes(IndexReader reader, BytesRef startTerm, 
+      BytesRef endTermExcl) throws IOException {
+    OpenBitSet incl = new OpenBitSet(reader.maxDoc());
+    Terms terms = MultiFields.getTerms(reader, midTerm.field());
+    TermsEnum te = terms.iterator();
+    if (startTerm != null) {
+      te.seek(startTerm);
+    }
+    while (true) {
+      final BytesRef term = te.next();
+      if (term == null) {
+        break;
+      }
+      if (endTermExcl != null && term.compareTo(endTermExcl) >= 0) {
+        break;
+      }
+      DocsEnum docs = MultiFields.getTermDocsEnum(reader, 
+          MultiFields.getDeletedDocs(reader), midTerm.field(), term);
+      while (true) {
+        final int doc = docs.nextDoc();
+        if (doc != DocsEnum.NO_MORE_DOCS) {
+          incl.set(doc);
+        } else break;
+      }
+    }
+    OpenBitSet dels = new OpenBitSet(reader.maxDoc());
+    for (int x=0; x < reader.maxDoc(); x++) {
+      if (!incl.get(x)) {
+        dels.set(x);
+      }
+    }
+    return dels;
+  }
+  
+  public static class DeletesIndexReader extends FilterIndexReader {
+    OpenBitSet readerDels;
+    
+    public DeletesIndexReader(IndexReader reader, OpenBitSet deletes) {
+      super(new SlowMultiReaderWrapper(reader));
+      readerDels = new OpenBitSet(reader.maxDoc());
+      if (in.hasDeletions()) {
+        final Bits oldDelBits = MultiFields.getDeletedDocs(in);
+        assert oldDelBits != null;
+        for (int i = 0; i < in.maxDoc(); i++) {
+          if (oldDelBits.get(i) || deletes.get(i)) {
+            readerDels.set(i);
+          }
+        }
+      } else {
+        readerDels = deletes;
+      }
+    }
+    
+    @Override
+    public int numDocs() {
+      return in.maxDoc() - (int)readerDels.cardinality();
+    }
+    
+    @Override
+    public boolean hasDeletions() {
+      return (int)readerDels.cardinality() > 0;
+    }
+    
+    @Override
+    public Bits getDeletedDocs() {
+      return readerDels;
+    }
+  }
+}
--- a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java
+++ b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java
@ -0,0 +1,93 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Version;
+import org.apache.lucene.util.LuceneTestCase;
+
+
+public class TestPKIndexSplitter extends LuceneTestCase {
+  public void testSplit() throws Exception {
+    NumberFormat format = new DecimalFormat("000000000");
+    
+    Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(
+        Version.LUCENE_CURRENT,
+        new WhitespaceAnalyzer(Version.LUCENE_CURRENT))
+        .setOpenMode(OpenMode.CREATE));
+    for (int x=0; x < 10; x++) {
+      Document doc = createDocument(x, "1", 3, format);
+      w.addDocument(doc);
+    }
+    for (int x=15; x < 20; x++) {
+      Document doc = createDocument(x, "2", 3, format);
+      w.addDocument(doc);
+    }
+    w.close();
+    
+    Directory dir1 = newDirectory();
+    Directory dir2 = newDirectory();
+    Term splitTerm = new Term("id", new BytesRef(format.format(11)));
+    PKIndexSplitter splitter = new PKIndexSplitter(splitTerm, 
+        dir, dir1, dir2);
+    splitter.split();
+    
+    IndexReader ir1 = IndexReader.open(dir1);
+    IndexReader ir2 = IndexReader.open(dir2);
+    assertEquals(10, ir1.maxDoc());
+    assertEquals(4, ir2.maxDoc());
+    
+    ir1.close();
+    ir2.close();
+    
+    dir1.close();
+    dir2.close();
+    dir.close();
+  }
+  
+  public Document createDocument(int n, String indexName, 
+      int numFields, NumberFormat format) {
+    StringBuilder sb = new StringBuilder();
+    Document doc = new Document();
+    String id = format.format(n);
+    doc.add(new Field("id", id, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("indexname", indexName, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
+    sb.append("a");
+    sb.append(n);
+    doc.add(new Field("field1", sb.toString(), Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
+    sb.append(" b");
+    sb.append(n);
+    for (int i = 1; i < numFields; i++) {
+      doc.add(new Field("field" + (i + 1), sb.toString(), Store.YES,
+                        Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
+    }
+    return doc;
+  }
+}