From bba6b01826dc18e2c2d0977eac375b3a565300da Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Fri, 17 Jun 2011 22:52:55 +0000 Subject: [PATCH] LUCENE-2919: add PKIndexSplitter, to split index into two by term git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1137064 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 4 + .../apache/lucene/index/PKIndexSplitter.java | 136 ++++++++++++++++++ .../lucene/index/TestPKIndexSplitter.java | 93 ++++++++++++ 3 files changed, 233 insertions(+) create mode 100644 lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java create mode 100644 lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 118dc5787ea..04cea37111c 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -79,6 +79,10 @@ New Features facilitate doing grouping in a distributed environment (Uwe Schindler, Mike McCandless) + * LUCENE-2919: Added PKIndexSplitter, that splits an index according + to a middle term in a specified field. (Jason Rutherglen via Mike + McCandless) + API Changes * LUCENE-3141: add getter method to access fragInfos in FieldFragList. diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java new file mode 100644 index 00000000000..1bae43b42c8 --- /dev/null +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/PKIndexSplitter.java @@ -0,0 +1,136 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.Version; + +/** + * Split an index based on a given primary key term + * and a 'middle' term. If the middle term is present, it's + * sent to dir2. + */ +public class PKIndexSplitter { + private Term midTerm; + Directory input; + Directory dir1; + Directory dir2; + + public PKIndexSplitter(Term midTerm, Directory input, + Directory dir1, Directory dir2) { + this.midTerm = midTerm; + this.input = input; + this.dir1 = dir1; + this.dir2 = dir2; + } + + public void split() throws IOException { + IndexReader reader = IndexReader.open(input); + OpenBitSet lowDels = setDeletes(reader, null, midTerm.bytes()); + OpenBitSet hiDels = setDeletes(reader, midTerm.bytes(), null); + + createIndex(dir1, reader, lowDels); + createIndex(dir2, reader, hiDels); + reader.close(); + } + + private void createIndex(Directory target, IndexReader reader, OpenBitSet bv) throws IOException { + IndexWriter w = new IndexWriter(target, new IndexWriterConfig( + Version.LUCENE_CURRENT, + new WhitespaceAnalyzer(Version.LUCENE_CURRENT)) + .setOpenMode(OpenMode.CREATE)); + w.addIndexes(new DeletesIndexReader(reader, bv)); + w.close(); + } + + private OpenBitSet setDeletes(IndexReader reader, BytesRef startTerm, + BytesRef endTermExcl) throws IOException { + OpenBitSet incl = new OpenBitSet(reader.maxDoc()); + Terms terms = MultiFields.getTerms(reader, midTerm.field()); + TermsEnum te = terms.iterator(); + if (startTerm != null) { + te.seek(startTerm); + } + while (true) { + final BytesRef term = te.next(); + if (term == null) { + break; + } + if (endTermExcl != null && term.compareTo(endTermExcl) >= 0) { + break; + } + DocsEnum docs = MultiFields.getTermDocsEnum(reader, + MultiFields.getDeletedDocs(reader), midTerm.field(), term); + while (true) { + final int doc = docs.nextDoc(); + if (doc != DocsEnum.NO_MORE_DOCS) { + incl.set(doc); + } else break; + } + } + OpenBitSet dels = new OpenBitSet(reader.maxDoc()); + for (int x=0; x < reader.maxDoc(); x++) { + if (!incl.get(x)) { + dels.set(x); + } + } + return dels; + } + + public static class DeletesIndexReader extends FilterIndexReader { + OpenBitSet readerDels; + + public DeletesIndexReader(IndexReader reader, OpenBitSet deletes) { + super(new SlowMultiReaderWrapper(reader)); + readerDels = new OpenBitSet(reader.maxDoc()); + if (in.hasDeletions()) { + final Bits oldDelBits = MultiFields.getDeletedDocs(in); + assert oldDelBits != null; + for (int i = 0; i < in.maxDoc(); i++) { + if (oldDelBits.get(i) || deletes.get(i)) { + readerDels.set(i); + } + } + } else { + readerDels = deletes; + } + } + + @Override + public int numDocs() { + return in.maxDoc() - (int)readerDels.cardinality(); + } + + @Override + public boolean hasDeletions() { + return (int)readerDels.cardinality() > 0; + } + + @Override + public Bits getDeletedDocs() { + return readerDels; + } + } +} diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java new file mode 100644 index 00000000000..130c0cc559f --- /dev/null +++ b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestPKIndexSplitter.java @@ -0,0 +1,93 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +import java.text.DecimalFormat; +import java.text.NumberFormat; + +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Version; +import org.apache.lucene.util.LuceneTestCase; + + +public class TestPKIndexSplitter extends LuceneTestCase { + public void testSplit() throws Exception { + NumberFormat format = new DecimalFormat("000000000"); + + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, new IndexWriterConfig( + Version.LUCENE_CURRENT, + new WhitespaceAnalyzer(Version.LUCENE_CURRENT)) + .setOpenMode(OpenMode.CREATE)); + for (int x=0; x < 10; x++) { + Document doc = createDocument(x, "1", 3, format); + w.addDocument(doc); + } + for (int x=15; x < 20; x++) { + Document doc = createDocument(x, "2", 3, format); + w.addDocument(doc); + } + w.close(); + + Directory dir1 = newDirectory(); + Directory dir2 = newDirectory(); + Term splitTerm = new Term("id", new BytesRef(format.format(11))); + PKIndexSplitter splitter = new PKIndexSplitter(splitTerm, + dir, dir1, dir2); + splitter.split(); + + IndexReader ir1 = IndexReader.open(dir1); + IndexReader ir2 = IndexReader.open(dir2); + assertEquals(10, ir1.maxDoc()); + assertEquals(4, ir2.maxDoc()); + + ir1.close(); + ir2.close(); + + dir1.close(); + dir2.close(); + dir.close(); + } + + public Document createDocument(int n, String indexName, + int numFields, NumberFormat format) { + StringBuilder sb = new StringBuilder(); + Document doc = new Document(); + String id = format.format(n); + doc.add(new Field("id", id, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)); + doc.add(new Field("indexname", indexName, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)); + sb.append("a"); + sb.append(n); + doc.add(new Field("field1", sb.toString(), Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)); + sb.append(" b"); + sb.append(n); + for (int i = 1; i < numFields; i++) { + doc.add(new Field("field" + (i + 1), sb.toString(), Store.YES, + Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)); + } + return doc; + } +}