diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt
index 39ccb82a682..9b5db8efa51 100644
--- a/contrib/CHANGES.txt
+++ b/contrib/CHANGES.txt
@@ -41,10 +41,13 @@ New features
    segment merges to give better search performance in a mixed
    indexing/searching environment. (John Wang via Mike McCandless)
 
- * LUCENE-1959: Add IndexSplitter tool, to copy specific segments out
-   of the index into a new index. It can also list the segments in
-   the index, and delete specified segments. (Jason Rutherglen via
-   Mike McCandless)
+ * LUCENE-1959: Add index splitting tools. The IndexSplitter tool
+   works on multi-segment (non-optimized) indexes and can copy
+   specific segments out of the index into a new index. It can also
+   list the segments in the index and delete specified segments.
+   (Jason Rutherglen via Mike McCandless)  MultiPassIndexSplitter can
+   split any index into any number of output parts, at the cost of
+   doing multiple passes over the input index. (Andrzej Bialecki)
 
 Optimizations
 
diff --git a/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java b/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java
new file mode 100644
index 00000000000..f13c9f87c89
--- /dev/null
+++ b/contrib/misc/src/java/org/apache/lucene/index/MultiPassIndexSplitter.java
@@ -0,0 +1,236 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.OpenBitSet;
+
+/**
+ * This tool splits the input index into multiple equal parts. The method
+ * employed here uses {@link IndexWriter#addIndexes(IndexReader[])}, where the
+ * input data comes from the input index with artificially applied deletes to
+ * the document IDs that fall outside the selected partition.
+ * <br>
+ * Note 1: Deletes are only applied to a buffered list of deleted docs and
+ * don't affect the source index - this tool also works with read-only indexes.
+ * <br>
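+ * A typical programmatic invocation looks like the following sketch (the
+ * index paths are illustrative only):
+ * <pre>
+ *   IndexReader input = IndexReader.open(FSDirectory.open(new File("srcIndex")), true);
+ *   Directory[] parts = new Directory[]{
+ *     FSDirectory.open(new File("part-0")),
+ *     FSDirectory.open(new File("part-1"))
+ *   };
+ *   new MultiPassIndexSplitter().split(input, parts, false); // false = round-robin
+ * </pre>
+ * <br>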
+ * Note 2: the disadvantage of this tool is that the source index needs to be
+ * read as many times as there are parts to be created, hence the name of this
+ * tool.
+ */
+public class MultiPassIndexSplitter {
+
+  /**
+   * Splits the source index into multiple parts.
+   * @param input the source index. It can be read-only, it can have
+   * deletions, and it can have multiple segments (or multiple readers).
+   * @param outputs list of directories where the output parts will be stored.
+   * @param seq if true, then the source index will be split into increasing
+   * ranges of document IDs of roughly equal size. If false, source document
+   * IDs will be assigned to output splits in a deterministic round-robin
+   * fashion.
+   * @throws IOException
+   */
+  public void split(IndexReader input, Directory[] outputs, boolean seq) throws IOException {
+    if (outputs == null || outputs.length < 2) {
+      throw new IOException("Invalid number of outputs.");
+    }
+    if (input == null || input.numDocs() < 2) {
+      throw new IOException("Not enough documents for splitting");
+    }
+    int numParts = outputs.length;
+    // Wrap a potentially read-only input. This way we don't have to preserve
+    // the original deletions, because neither deleteDocument(int) nor
+    // undeleteAll() is ever applied to the wrapped input index.
+    input = new FakeDeleteIndexReader(input);
+    int maxDoc = input.maxDoc();
+    int partLen = maxDoc / numParts;
+    for (int i = 0; i < numParts; i++) {
+      input.undeleteAll();
+      if (seq) { // sequential range
+        int lo = partLen * i;
+        int hi = lo + partLen;
+        // below range
+        for (int j = 0; j < lo; j++) {
+          input.deleteDocument(j);
+        }
+        // above range - the last part collects all IDs that remain due to
+        // integer rounding
+        if (i < numParts - 1) {
+          for (int j = hi; j < maxDoc; j++) {
+            input.deleteDocument(j);
+          }
+        }
+      } else {
+        // round-robin
+        for (int j = 0; j < maxDoc; j++) {
+          if ((j + numParts - i) % numParts != 0) {
+            input.deleteDocument(j);
+          }
+        }
+      }
+      IndexWriter w = new IndexWriter(outputs[i], new WhitespaceAnalyzer(),
+          true, MaxFieldLength.UNLIMITED);
+      System.err.println("Writing part " + (i + 1) + " ...");
+      w.addIndexes(new IndexReader[]{input});
+      w.close();
+    }
+    System.err.println("Done.");
+  }
+
+  public static void main(String[] args) throws Exception {
+    if (args.length < 5) {
+      System.err.println("Usage: MultiPassIndexSplitter -out <outputDir> -num <numParts> [-seq] <inputIndex1> [<inputIndex2> ...]");
+      System.exit(-1);
+    }
+    ArrayList<IndexReader> indexes = new ArrayList<IndexReader>();
+    String outDir = null;
+    int numParts = -1;
+    boolean seq = false;
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-out")) {
+        outDir = args[++i];
+      } else if (args[i].equals("-num")) {
+        numParts = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-seq")) {
+        seq = true;
+      } else {
+        File file = new File(args[i]);
+        if (!file.exists() || !file.isDirectory()) {
+          System.err.println("Invalid input path - skipping: " + file);
+          continue;
+        }
+        Directory dir = FSDirectory.open(new File(args[i]));
+        try {
+          if (!IndexReader.indexExists(dir)) {
+            System.err.println("Invalid input index - skipping: " + file);
+            continue;
+          }
+        } catch (Exception e) {
+          System.err.println("Invalid input index - skipping: " + file);
+          continue;
+        }
+        indexes.add(IndexReader.open(dir, true));
+      }
+    }
+    if (outDir == null) {
+      throw new Exception("Required argument missing: -out outputDir");
+    }
+    if (numParts < 2) {
+      throw new Exception("Invalid value of required argument: -num numParts");
+    }
+    if (indexes.size() == 0) {
+      throw new Exception("No input indexes to process");
+    }
+    File out = new File(outDir);
+    if (!out.mkdirs()) {
+      throw new Exception("Can't create output directory: " + out);
+    }
+    Directory[] dirs = new Directory[numParts];
+    for (int i = 0; i < numParts; i++) {
+      dirs[i] = FSDirectory.open(new File(out, "part-" + i));
+    }
+    MultiPassIndexSplitter splitter = new MultiPassIndexSplitter();
+    IndexReader input;
+    if (indexes.size() == 1) {
+      input = indexes.get(0);
+    } else {
+      input = new MultiReader(indexes.toArray(new IndexReader[indexes.size()]));
+    }
+    splitter.split(input, dirs, seq);
+  }
+
+  /**
+   * This class pretends that it can write deletions to the underlying index.
+   * Instead, deletions are buffered in a bitset and overlaid with the original
+   * list of deletions.
+   */
+  public static class FakeDeleteIndexReader extends FilterIndexReader {
+    OpenBitSet dels;
+    OpenBitSet oldDels = null;
+
+    public FakeDeleteIndexReader(IndexReader in) {
+      super(in);
+      dels = new OpenBitSet(in.maxDoc());
+      if (in.hasDeletions()) {
+        oldDels = new OpenBitSet(in.maxDoc());
+        for (int i = 0; i < in.maxDoc(); i++) {
+          if (in.isDeleted(i)) oldDels.set(i);
+        }
+        dels.or(oldDels);
+      }
+    }
+
+    @Override
+    public int numDocs() {
+      return in.maxDoc() - (int)dels.cardinality();
+    }
+
+    /**
+     * Just removes our overlaid deletions - does not undelete the original
+     * deletions.
+     */
+    @Override
+    protected void doUndeleteAll() throws CorruptIndexException, IOException {
+      dels = new OpenBitSet(in.maxDoc());
+      if (oldDels != null) {
+        dels.or(oldDels);
+      }
+    }
+
+    @Override
+    protected void doDelete(int n) throws CorruptIndexException, IOException {
+      dels.set(n);
+    }
+
+    @Override
+    public boolean hasDeletions() {
+      return !dels.isEmpty();
+    }
+
+    @Override
+    public boolean isDeleted(int n) {
+      return dels.get(n);
+    }
+
+    @Override
+    public TermPositions termPositions() throws IOException {
+      return new FilterTermPositions(in.termPositions()) {
+
+        @Override
+        public boolean next() throws IOException {
+          // skip over documents that carry an overlaid deletion
+          boolean res;
+          while ((res = super.next())) {
+            if (!dels.get(doc())) {
+              break;
+            }
+          }
+          return res;
+        }
+      };
+    }
+  }
+}
diff --git a/contrib/misc/src/test/org/apache/lucene/index/TestMultiPassIndexSplitter.java b/contrib/misc/src/test/org/apache/lucene/index/TestMultiPassIndexSplitter.java
new file mode 100644
index 00000000000..b7c8ab7791e
--- /dev/null
+++ b/contrib/misc/src/test/org/apache/lucene/index/TestMultiPassIndexSplitter.java
@@ -0,0 +1,127 @@
+package org.apache.lucene.index;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+import junit.framework.TestCase;
+
+public class TestMultiPassIndexSplitter extends TestCase {
+  IndexReader input;
+  int NUM_DOCS = 11;
+
+  public void setUp() throws Exception {
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true,
+        MaxFieldLength.LIMITED);
+    Document doc;
+    for (int i = 0; i < NUM_DOCS; i++) {
+      doc = new Document();
+      doc.add(new Field("id", i + "", Field.Store.YES, Field.Index.NOT_ANALYZED));
+      doc.add(new Field("f", i + " " + i, Field.Store.YES, Field.Index.ANALYZED));
+      w.addDocument(doc);
+    }
+    w.close();
+    input = IndexReader.open(dir, false);
+    // delete the last doc
+    input.deleteDocument(input.maxDoc() - 1);
+    input = input.reopen(true);
+  }
+
+  /**
+   * Test round-robin splitting.
+   */
+  public void testSplitRR() throws Exception {
+    MultiPassIndexSplitter splitter = new MultiPassIndexSplitter();
+    Directory[] dirs = new Directory[]{
+        new RAMDirectory(),
+        new RAMDirectory(),
+        new RAMDirectory()
+    };
+    splitter.split(input, dirs, false);
+    IndexReader ir;
+    ir = IndexReader.open(dirs[0], true);
+    assertTrue(ir.numDocs() - NUM_DOCS / 3 <= 1); // rounding error
+    Document doc = ir.document(0);
+    assertEquals("0", doc.get("id"));
+    Term t;
+    TermEnum te;
+    t = new Term("id", "1");
+    te = ir.terms(t);
+    assertNotSame(t, te.term());
+    ir.close();
+    ir = IndexReader.open(dirs[1], true);
+    assertTrue(ir.numDocs() - NUM_DOCS / 3 <= 1);
+    doc = ir.document(0);
+    assertEquals("1", doc.get("id"));
+    t = new Term("id", "0");
+    te = ir.terms(t);
+    assertNotSame(t, te.term());
+    ir.close();
+    ir = IndexReader.open(dirs[2], true);
+    assertTrue(ir.numDocs() - NUM_DOCS / 3 <= 1);
+    doc = ir.document(0);
+    assertEquals("2", doc.get("id"));
+    t = new Term("id", "1");
+    te = ir.terms(t);
+    assertNotSame(t, te.term());
+    t = new Term("id", "0");
+    te = ir.terms(t);
+    assertNotSame(t, te.term());
+  }
+
+  /**
+   * Test sequential splitting.
+   */
+  public void testSplitSeq() throws Exception {
+    MultiPassIndexSplitter splitter = new MultiPassIndexSplitter();
+    Directory[] dirs = new Directory[]{
+        new RAMDirectory(),
+        new RAMDirectory(),
+        new RAMDirectory()
+    };
+    splitter.split(input, dirs, true);
+    IndexReader ir;
+    ir = IndexReader.open(dirs[0], true);
+    assertTrue(ir.numDocs() - NUM_DOCS / 3 <= 1);
+    Document doc = ir.document(0);
+    assertEquals("0", doc.get("id"));
+    int start = ir.numDocs();
+    ir.close();
+    ir = IndexReader.open(dirs[1], true);
+    assertTrue(ir.numDocs() - NUM_DOCS / 3 <= 1);
+    doc = ir.document(0);
+    assertEquals(start + "", doc.get("id"));
+    start += ir.numDocs();
+    ir.close();
+    ir = IndexReader.open(dirs[2], true);
+    assertTrue(ir.numDocs() - NUM_DOCS / 3 <= 1);
+    doc = ir.document(0);
+    assertEquals(start + "", doc.get("id"));
+    // make sure the deleted doc is not here
+    Term t;
+    TermEnum te;
+    t = new Term("id", (NUM_DOCS - 1) + "");
+    te = ir.terms(t);
+    assertNotSame(t, te.term());
+  }
+}
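
For reference, a quick sketch of running the new tool from the command line
(the paths below are illustrative, not part of the patch). Splitting one
source index into three parts writes them to part-0, part-1 and part-2 under
the output directory:

    java org.apache.lucene.index.MultiPassIndexSplitter -out /tmp/split -num 3 /path/to/srcIndex

Adding -seq switches from round-robin document ID assignment to contiguous
document ID ranges.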