diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 57e3a96dc6f..6f6d4dcc415 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -33,6 +33,11 @@ New features segment merges to give better search performance in a mixed indexing/searching environment. (John Wang via Mike McCandless) + * LUCENE-1959: Add IndexSplitter tool, to copy specific segments out + of the index into a new index. It can also list the segments in + the index, and delete specified segments. (Jason Rutherglen via + Mike McCandless) + Optimizations Documentation diff --git a/contrib/misc/src/java/org/apache/lucene/index/IndexSplitter.java b/contrib/misc/src/java/org/apache/lucene/index/IndexSplitter.java new file mode 100644 index 00000000000..3683d9ceb4a --- /dev/null +++ b/contrib/misc/src/java/org/apache/lucene/index/IndexSplitter.java @@ -0,0 +1,163 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.text.DecimalFormat; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.store.FSDirectory; + +/** + * Command-line tool that enables listing segments in an + * index, copying specific segments to another index, and + * deleting segments from an index. + * + *

This tool does file-level copying of segments files. + * This means it's unable to split apart a single segment + * into multiple segments. For example if your index is + * optimized, this tool won't help. Also, it does basic + * file-level copying (using simple + * File{In,Out}putStream) so it will not work with non + * FSDirectory Directory impls.

+ * + *

NOTE: The tool is experimental and might change + * in incompatible ways in the next release. You can easily + * accidentally remove segments from your index so be + * careful! + */ +public class IndexSplitter { + public SegmentInfos infos; + + FSDirectory fsDir; + + File dir; + + /** + * @param args + */ + public static void main(String[] args) throws Exception { + if (args.length < 2) { + System.err + .println("Usage: IndexSplitter -l (list the segments and their sizes)"); + System.err.println("IndexSplitter +"); + System.err + .println("IndexSplitter -d (delete the following segments)"); + return; + } + File srcDir = new File(args[0]); + IndexSplitter is = new IndexSplitter(srcDir); + if (!srcDir.exists()) { + throw new Exception("srcdir:" + srcDir.getAbsolutePath() + + " doesn't exist"); + } + if (args[1].equals("-l")) { + is.listSegments(); + } else if (args[1].equals("-d")) { + List segs = new ArrayList(); + for (int x = 2; x < args.length; x++) { + segs.add(args[x]); + } + is.remove((String[]) segs.toArray(new String[0])); + } else { + File targetDir = new File(args[1]); + List segs = new ArrayList(); + for (int x = 2; x < args.length; x++) { + segs.add(args[x]); + } + is.split(targetDir, (String[]) segs.toArray(new String[0])); + } + } + + public IndexSplitter(File dir) throws IOException { + this.dir = dir; + fsDir = FSDirectory.open(dir); + infos = new SegmentInfos(); + infos.read(fsDir); + } + + public void listSegments() throws IOException { + DecimalFormat formatter = new DecimalFormat("###,###.###"); + for (int x = 0; x < infos.size(); x++) { + SegmentInfo info = infos.info(x); + String sizeStr = formatter.format(info.sizeInBytes()); + System.out.println(info.name + " " + sizeStr); + } + } + + private int getIdx(String name) { + for (int x = 0; x < infos.size(); x++) { + if (name.equals(infos.info(x).name)) + return x; + } + return -1; + } + + private SegmentInfo getInfo(String name) { + for (int x = 0; x < infos.size(); x++) { + if (name.equals(infos.info(x).name)) + return infos.info(x); + } + return null; + } + + public void remove(String[] segs) throws IOException { + for (String n : segs) { + int idx = getIdx(n); + infos.remove(idx); + } + infos.commit(fsDir); + } + + public void split(File destDir, String[] segs) throws IOException { + destDir.mkdirs(); + FSDirectory destFSDir = FSDirectory.open(destDir); + SegmentInfos destInfos = new SegmentInfos(); + for (String n : segs) { + SegmentInfo info = getInfo(n); + destInfos.add(info); + // now copy files over + List files = info.files(); + for (int x = 0; x < files.size(); x++) { + String srcName = (String) files.get(x); + File srcFile = new File(dir, srcName); + File destFile = new File(destDir, srcName); + copyFile(srcFile, destFile); + } + } + destInfos.commit(destFSDir); + // System.out.println("destDir:"+destDir.getAbsolutePath()); + } + + private static void copyFile(File src, File dst) throws IOException { + InputStream in = new FileInputStream(src); + OutputStream out = new FileOutputStream(dst); + byte[] buf = new byte[32*1024]; + int len; + while ((len = in.read(buf)) > 0) { + out.write(buf, 0, len); + } + in.close(); + out.close(); + } +} diff --git a/contrib/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java b/contrib/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java new file mode 100644 index 00000000000..77cc4a49ff0 --- /dev/null +++ b/contrib/misc/src/test/org/apache/lucene/index/TestIndexSplitter.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.File; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +public class TestIndexSplitter extends LuceneTestCase { + public void test() throws Exception { + String tmpDir = System.getProperty("java.io.tmpdir"); + File dir = new File(tmpDir, "testfilesplitter"); + _TestUtil.rmDir(dir); + dir.mkdirs(); + File destDir = new File(tmpDir, "testfilesplitterdest"); + _TestUtil.rmDir(destDir); + destDir.mkdirs(); + FSDirectory fsDir = FSDirectory.open(dir); + IndexWriter iw = new IndexWriter(fsDir, new WhitespaceAnalyzer(), true, MaxFieldLength.UNLIMITED); + for (int x=0; x < 100; x++) { + Document doc = TestIndexWriterReader.createDocument(x, "index", 5); + iw.addDocument(doc); + } + iw.commit(); + for (int x=100; x < 150; x++) { + Document doc = TestIndexWriterReader.createDocument(x, "index2", 5); + iw.addDocument(doc); + } + iw.commit(); + for (int x=150; x < 200; x++) { + Document doc = TestIndexWriterReader.createDocument(x, "index3", 5); + iw.addDocument(doc); + } + iw.commit(); + assertEquals(3, iw.getReader().getSequentialSubReaders().length); + iw.close(); + // we should have 2 segments now + IndexSplitter is = new IndexSplitter(dir); + String splitSegName = is.infos.info(1).name; + is.split(destDir, new String[] {splitSegName}); + IndexReader r = IndexReader.open(FSDirectory.open(destDir), true); + assertEquals(50, r.maxDoc()); + + // now test cmdline + File destDir2 = new File(tmpDir, "testfilesplitterdest2"); + _TestUtil.rmDir(destDir2); + destDir2.mkdirs(); + IndexSplitter.main(new String[] {dir.getAbsolutePath(), destDir2.getAbsolutePath(), splitSegName}); + assertEquals(3, destDir2.listFiles().length); + r = IndexReader.open(FSDirectory.open(destDir2), true); + assertEquals(50, r.maxDoc()); + + // now remove the copied segment from src + IndexSplitter.main(new String[] {dir.getAbsolutePath(), "-d", splitSegName}); + r = IndexReader.open(FSDirectory.open(dir), true); + assertEquals(2, r.getSequentialSubReaders().length); + } +}