diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index bc98e3c13d4..7264c4ecf0e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -19,6 +19,9 @@ Changes in backwards compatibility policy (Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless, Robert Muir) +* LUCENE-4677: unpacked FSTs now use vInt to encode the node target, + to reduce their size (Mike McCandless) + * LUCENE-4678: FST now uses a paged byte[] structure instead of a single byte[] internally, to avoid large memory spikes during building (James Dyer, Mike McCandless) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index c1933bbd8b2..58ec0cf67da 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -27,6 +27,11 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.HashMap; import java.util.Map; +/* +import java.io.Writer; +import java.io.OutputStreamWriter; +import java.io.FileOutputStream; +*/ import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.DataInput; @@ -124,7 +129,10 @@ public final class FST { /** Added optional packed format. */ private final static int VERSION_PACKED = 3; - private final static int VERSION_CURRENT = VERSION_PACKED; + /** Changed from int to vInt for encoding arc targets. */ + private final static int VERSION_VINT_TARGET = 4; + + private final static int VERSION_CURRENT = VERSION_VINT_TARGET; // Never serialized; just used to represent the virtual // final node w/ no arcs: @@ -259,12 +267,15 @@ public final class FST { // clear early on: private GrowableWriter inCounts; + private final int version; + // make a new empty FST, for building; Builder invokes // this ctor FST(INPUT_TYPE inputType, Outputs outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) { this.inputType = inputType; this.outputs = outputs; this.allowArrayArcs = allowArrayArcs; + version = VERSION_CURRENT; // 32 KB blocks: bytes = new BytesStore(15); // pad: ensure no node gets address 0 which is reserved to mean @@ -289,7 +300,7 @@ public final class FST { this.outputs = outputs; // NOTE: only reads most recent format; we don't have // back-compat promise for FSTs (they are experimental): - CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED); + version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET); packed = in.readByte() == 1; if (in.readByte() == 1) { // accepts empty string @@ -350,6 +361,15 @@ public final class FST { // building; we need to break out mutable FST from // immutable allowArrayArcs = false; + + /* + if (bytes.length == 665) { + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); + Util.toDot(this, w, false, false); + w.close(); + System.out.println("Wrote FST to out.dot"); + } + */ } public INPUT_TYPE getInputType() { @@ -661,7 +681,7 @@ public final class FST { if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { assert target.node > 0; //System.out.println(" write target"); - bytes.writeInt(target.node); + bytes.writeVInt(target.node); } // just write the arcs "like normal" on first pass, @@ -800,12 +820,10 @@ public final class FST { } if (arc.flag(BIT_STOP_NODE)) { } else if (arc.flag(BIT_TARGET_NEXT)) { + } else if (packed) { + in.readVInt(); } else { - if (packed) { - in.readVInt(); - } else { - in.skipBytes(4); - } + readUnpackedNodeTarget(in); } arc.flags = in.readByte(); } @@ -819,6 +837,16 @@ public final class FST { } } + private int readUnpackedNodeTarget(BytesReader in) throws IOException { + int target; + if (version < VERSION_VINT_TARGET) { + target = in.readInt(); + } else { + target = in.readVInt(); + } + return target; + } + /** * Follow the follow arc and read the first arc of its target; * this changes the provided arc (2nd arg) in-place and returns @@ -920,8 +948,10 @@ public final class FST { final byte b = in.readByte(); if (b == ARCS_AS_FIXED_ARRAY) { - //System.out.println(" nextArc fake array"); + //System.out.println(" nextArc fixed array"); in.readVInt(); + + // Skip bytesPerArc: if (packed) { in.readVInt(); } else { @@ -1024,7 +1054,7 @@ public final class FST { //System.out.println(" abs code=" + code); } } else { - arc.target = in.readInt(); + arc.target = readUnpackedNodeTarget(in); } arc.nextArc = in.getPosition(); } @@ -1147,7 +1177,7 @@ public final class FST { if (packed) { in.readVInt(); } else { - in.readInt(); + readUnpackedNodeTarget(in); } } @@ -1354,6 +1384,7 @@ public final class FST { // Creates a packed FST private FST(INPUT_TYPE inputType, Outputs outputs) { + version = VERSION_CURRENT; packed = true; this.inputType = inputType; // 32 KB blocks: diff --git a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index d07c77359fa..87727db3d76 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -58,7 +58,6 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; @@ -100,7 +99,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { createIndex("index.nocfs", false, false); } */ - + /* // These are only needed for the special upgrade test to verify // that also single-segment indexes are correctly upgraded by IndexUpgrader. @@ -116,8 +115,40 @@ public class TestBackwardsCompatibility extends LuceneTestCase { } */ + + /* + public void testCreateMoreTermsIndex() throws Exception { + // we use a real directory name that is not cleaned up, + // because this method is only used to create backwards + // indexes: + File indexDir = new File("moreterms"); + _TestUtil.rmDir(indexDir); + Directory dir = newFSDirectory(indexDir); + + LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy(); + mp.setUseCompoundFile(false); + mp.setNoCFSRatio(1.0); + mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY); + // TODO: remove randomness + IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())) + .setMergePolicy(mp); + conf.setCodec(Codec.forName("Lucene40")); + IndexWriter writer = new IndexWriter(dir, conf); + LineFileDocs docs = new LineFileDocs(null, true); + for(int i=0;i<50;i++) { + writer.addDocument(docs.nextDoc()); + } + writer.close(); + dir.close(); + + // Gives you time to copy the index out!: (there is also + // a test option to not remove temp dir...): + Thread.sleep(100000); + } + */ + final static String[] oldNames = {"40.cfs", - "40.nocfs", + "40.nocfs", }; final String[] unsupportedNames = {"19.cfs", @@ -145,7 +176,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { }; final static String[] oldSingleSegmentNames = {"40.optimized.cfs", - "40.optimized.nocfs", + "40.optimized.nocfs", }; static Map oldIndexDirs; @@ -908,4 +939,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase { dir.close(); } } + + public static final String moreTermsIndex = "moreterms.40.zip"; + + public void testMoreTerms() throws Exception { + File oldIndexDir = _TestUtil.getTempDir("moreterms"); + _TestUtil.unzip(getDataFile(moreTermsIndex), oldIndexDir); + Directory dir = newFSDirectory(oldIndexDir); + // TODO: more tests + _TestUtil.checkIndex(dir); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/moreterms.40.zip b/lucene/core/src/test/org/apache/lucene/index/moreterms.40.zip new file mode 100644 index 00000000000..53ad7ce31e9 Binary files /dev/null and b/lucene/core/src/test/org/apache/lucene/index/moreterms.40.zip differ diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java index 2ff3ee4491c..4bf3a379016 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java @@ -289,6 +289,13 @@ public class FSTTester { null, willRewrite, true); + if (LuceneTestCase.VERBOSE) { + if (willRewrite) { + System.out.println("TEST: packed FST"); + } else { + System.out.println("TEST: non-packed FST"); + } + } for(InputOutput pair : pairs) { if (pair.output instanceof List) {