From 93b83f635dffc782dc70174e5ea377ceab6a8174 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 5 Feb 2020 16:31:07 -0500 Subject: [PATCH] LUCENE-9206: Improve IndexMergeTool defaults and options IndexMergeTool previously had no options and always forceMerge(1) the resulting index. This can result in wasted work and confusing performance (unbalancing the index). Instead the default is to not do anything, except merges from the merge policy. --- lucene/CHANGES.txt | 4 + lucene/MIGRATE.txt | 6 ++ .../apache/lucene/misc/IndexMergeTool.java | 89 ++++++++++++++++--- .../lucene/misc/TestIndexMergeTool.java | 65 ++++++++++++++ 4 files changed, 150 insertions(+), 14 deletions(-) create mode 100644 lucene/misc/src/test/org/apache/lucene/misc/TestIndexMergeTool.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0d6b772a9a5..269e3dda81d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -68,6 +68,10 @@ Improvements * LUCENE-9110: Refactor stack analysis in tests to use generalized LuceneTestCase methods that use StackWalker (Uwe Schindler) +* LUCENE-9206: IndexMergeTool gets additional options to control the merging. + This tool no longer forceMerge(1)s to a single segment by default. If you + rely upon this behavior, pass -max-segments 1 instead. (Robert Muir) + Bug fixes * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index b7c17e2df1d..c019260e57c 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -1,5 +1,11 @@ # Apache Lucene Migration Guide +## o.a.l.misc.IndexMergeTool defaults changes (LUCENE-9206) ## + +This command-line tool no longer forceMerges to a single segment. Instead, by +default it just follows (configurable) merge policy. If you really want to merge +to a single segment, you can pass -max-segments 1. + ## o.a.l.util.fst.Builder is renamed FSTCompiler with fluent-style Builder (LUCENE-9089) ## Simply use FSTCompiler instead of the previous Builder. Use either the simple constructor with default settings, or diff --git a/lucene/misc/src/java/org/apache/lucene/misc/IndexMergeTool.java b/lucene/misc/src/java/org/apache/lucene/misc/IndexMergeTool.java index 09eb850a3f7..1342a553d89 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/IndexMergeTool.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/IndexMergeTool.java @@ -19,12 +19,12 @@ package org.apache.lucene.misc; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.MergePolicy; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.HardlinkCopyDirectoryWrapper; import org.apache.lucene.util.SuppressForbidden; -import java.io.IOException; import java.nio.file.Paths; /** @@ -34,28 +34,89 @@ import java.nio.file.Paths; @SuppressForbidden(reason = "System.out required: command line tool") public class IndexMergeTool { - public static void main(String[] args) throws IOException { - if (args.length < 3) { - System.err.println("Usage: IndexMergeTool [index3] ..."); - System.exit(1); + static final String USAGE = + "Usage: IndexMergeTool [OPTION...] [index3] ...\n" + + "Merges source indexes 'index1' .. 'indexN' into 'mergedIndex'\n" + + "\n" + + "OPTIONS:\n" + + " -merge-policy ClassName specifies MergePolicy class (must be in CLASSPATH).The default is\n" + + " 'org.apache.lucene.index.TieredMergePolicy.TieredMergePolicy'\n" + + " -max-segments N force-merge's the index to a maximum of N segments. Default is\n" + + " to execute only the merges according to the merge policy.\n" + + " -verbose print additional details.\n"; + + static class Options { + String mergedIndexPath; + String indexPaths[]; + IndexWriterConfig config = new IndexWriterConfig(null).setOpenMode(OpenMode.CREATE); + int maxSegments = 0; + + static Options parse(String args[]) throws ReflectiveOperationException { + Options options = new Options(); + int index = 0; + while (index < args.length) { + if (!args[index].startsWith("-")) { + break; + } + if (args[index] == "--") { + break; + } + switch(args[index]) { + case "-merge-policy": + String clazzName = args[++index]; + Class clazz = Class.forName(clazzName).asSubclass(MergePolicy.class); + options.config.setMergePolicy(clazz.getConstructor().newInstance()); + break; + case "-max-segments": + options.maxSegments = Integer.parseInt(args[++index]); + break; + case "-verbose": + options.config.setInfoStream(System.err); + break; + default: throw new IllegalArgumentException("unrecognized option: '" + args[index] + "'\n" + USAGE); + } + index++; + } + + // process any remaining arguments as the target and source index paths. + int numPaths = args.length - index; + if (numPaths < 3) { + throw new IllegalArgumentException("not enough parameters.\n" + USAGE); + } + + options.mergedIndexPath = args[index]; + options.indexPaths = new String[numPaths - 1]; + System.arraycopy(args, index + 1, options.indexPaths, 0, options.indexPaths.length); + return options; + } + } + + public static void main(String[] args) throws Exception { + Options options = null; + try { + options = Options.parse(args); + } catch (IllegalArgumentException e) { + System.err.println(e.getMessage()); + System.exit(2); } // Try to use hardlinks to source segments, if possible. - Directory mergedIndex = new HardlinkCopyDirectoryWrapper(FSDirectory.open(Paths.get(args[0]))); + Directory mergedIndex = new HardlinkCopyDirectoryWrapper(FSDirectory.open(Paths.get(options.mergedIndexPath))); - IndexWriter writer = new IndexWriter(mergedIndex, - new IndexWriterConfig(null).setOpenMode(OpenMode.CREATE)); - - Directory[] indexes = new Directory[args.length - 1]; - for (int i = 1; i < args.length; i++) { - indexes[i - 1] = FSDirectory.open(Paths.get(args[i])); + Directory[] indexes = new Directory[options.indexPaths.length]; + for (int i = 0; i < indexes.length; i++) { + indexes[i] = FSDirectory.open(Paths.get(options.indexPaths[i])); } + IndexWriter writer = new IndexWriter(mergedIndex, options.config); + System.out.println("Merging..."); writer.addIndexes(indexes); - System.out.println("Full merge..."); - writer.forceMerge(1); + if (options.maxSegments > 0) { + System.out.println("Force-merging to " + options.maxSegments + "..."); + writer.forceMerge(options.maxSegments); + } writer.close(); System.out.println("Done."); } diff --git a/lucene/misc/src/test/org/apache/lucene/misc/TestIndexMergeTool.java b/lucene/misc/src/test/org/apache/lucene/misc/TestIndexMergeTool.java new file mode 100644 index 00000000000..b9183ea75ab --- /dev/null +++ b/lucene/misc/src/test/org/apache/lucene/misc/TestIndexMergeTool.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.misc; + +import org.apache.lucene.index.LogDocMergePolicy; +import org.apache.lucene.misc.IndexMergeTool.Options; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.PrintStreamInfoStream; + +public class TestIndexMergeTool extends LuceneTestCase { + + public void testNoParameters() throws Exception { + expectThrows(IllegalArgumentException.class, () -> { + Options.parse(new String[] {}); + }); + } + + public void testOneParameter() throws Exception { + expectThrows(IllegalArgumentException.class, () -> { + Options.parse(new String[] { "target" }); + }); + } + + public void testTwoParameters() throws Exception { + expectThrows(IllegalArgumentException.class, () -> { + Options.parse(new String[] { "target", "source1" }); + }); + } + + public void testThreeParameters() throws Exception { + Options options = Options.parse(new String[] { "target", "source1", "source2" }); + assertEquals("target", options.mergedIndexPath); + assertArrayEquals(new String[] { "source1", "source2" }, options.indexPaths); + } + + public void testVerboseOption() throws Exception { + Options options = Options.parse(new String[] { "-verbose", "target", "source1", "source2" }); + assertEquals(PrintStreamInfoStream.class, options.config.getInfoStream().getClass()); + } + + public void testMergePolicyOption() throws Exception { + Options options = Options.parse(new String[] { "-merge-policy", LogDocMergePolicy.class.getName(), "target", "source1", "source2" }); + assertEquals(LogDocMergePolicy.class, options.config.getMergePolicy().getClass()); + } + + public void testMaxSegmentsOption() throws Exception { + Options options = Options.parse(new String[] { "-max-segments", "42", "target", "source1", "source2" }); + assertEquals(42, options.maxSegments); + } + +}