From 549855a0d659c0e237ba3f89124c8ef2c9da43ad Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Thu, 31 Jan 2013 21:51:58 +0000 Subject: [PATCH] LUCENE-4739: fix FST.save/load to work with > 1.1 GB FSTs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1441213 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 + .../apache/lucene/util/fst/BytesStore.java | 6 +- .../java/org/apache/lucene/util/fst/FST.java | 28 ++- .../org/apache/lucene/util/fst/Test2BFST.java | 226 +++++++++++------- 4 files changed, 164 insertions(+), 99 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 3abd8b9bb5c..cf60630e67b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -123,6 +123,9 @@ Bug Fixes * LUCENE-4732: Fixed TermsEnum.seekCeil/seekExact on term vectors. (Adrien Grand, Robert Muir) +* LUCENE-4739: Fixed bugs that prevented FSTs more than ~1.1GB from + being saved and loaded (Adrien Grand, Mike McCandless) + ======================= Lucene 4.1.0 ======================= Changes in backwards compatibility policy diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java index 504e8b7a11f..4b20947e3a7 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java @@ -46,7 +46,7 @@ class BytesStore extends DataOutput { } /** Pulls bytes from the provided IndexInput. */ - public BytesStore(DataInput in, int numBytes, int maxBlockSize) throws IOException { + public BytesStore(DataInput in, long numBytes, int maxBlockSize) throws IOException { int blockSize = 2; int blockBits = 1; while(blockSize < numBytes && blockSize < maxBlockSize) { @@ -56,9 +56,9 @@ class BytesStore extends DataOutput { this.blockBits = blockBits; this.blockSize = blockSize; this.blockMask = blockSize-1; - int left = numBytes; + long left = numBytes; while(left > 0) { - final int chunk = Math.min(blockSize, left); + final int chunk = (int) Math.min(blockSize, left); byte[] block = new byte[chunk]; in.readBytes(block, 0, block.length); blocks.add(block); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index e5c45a97f6d..df52cd526d2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -27,11 +27,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.HashMap; import java.util.Map; -/* -import java.io.Writer; -import java.io.OutputStreamWriter; -import java.io.FileOutputStream; -*/ import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.ByteArrayDataOutput; @@ -41,12 +36,15 @@ import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Constants; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.fst.Builder.UnCompiledNode; import org.apache.lucene.util.packed.GrowableWriter; import org.apache.lucene.util.packed.PackedInts; +//import java.io.Writer; +//import java.io.OutputStreamWriter; // TODO: break this into WritableFST and ReadOnlyFST.. then // we can have subclasses of ReadOnlyFST to handle the @@ -276,7 +274,6 @@ public final class FST { this.outputs = outputs; this.allowArrayArcs = allowArrayArcs; version = VERSION_CURRENT; - // 32 KB blocks: bytes = new BytesStore(bytesPageBits); // pad: ensure no node gets address 0 which is reserved to mean // the stop state w/ no arcs @@ -295,9 +292,22 @@ public final class FST { nodeRefToAddress = null; } + public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28; + /** Load a previously saved FST. */ public FST(DataInput in, Outputs outputs) throws IOException { + this(in, outputs, DEFAULT_MAX_BLOCK_BITS); + } + + /** Load a previously saved FST; maxBlockBits allows you to + * control the size of the byte[] pages used to hold the FST bytes. */ + public FST(DataInput in, Outputs outputs, int maxBlockBits) throws IOException { this.outputs = outputs; + + if (maxBlockBits < 1 || maxBlockBits > 30) { + throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits); + } + // NOTE: only reads most recent format; we don't have // back-compat promise for FSTs (they are experimental): version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET); @@ -345,13 +355,13 @@ public final class FST { } else { nodeRefToAddress = null; } - startNode = in.readVInt(); + startNode = in.readVLong(); nodeCount = in.readVLong(); arcCount = in.readVLong(); arcWithOutputCount = in.readVLong(); - int numBytes = in.readVInt(); - bytes = new BytesStore(in, numBytes, Integer.MAX_VALUE); + long numBytes = in.readVLong(); + bytes = new BytesStore(in, numBytes, 1< fst = b.finish(); - System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + for(int verify=0;verify<2;verify++) { + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); - Arrays.fill(ints2, 0); - r = new Random(seed); + Arrays.fill(ints2, 0); + r = new Random(seed); - for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); + + Arrays.fill(ints2, 0); + r = new Random(seed); + int upto = 0; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + for(int j=10;j(in, outputs); + in.close(); + } else { + dir.deleteFile("fst"); } - assertEquals(NO_OUTPUT, Util.get(fst, input2)); - nextInput(r, ints2); } - - System.out.println("\nTEST: enum all input/outputs"); - IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); - - Arrays.fill(ints2, 0); - r = new Random(seed); - int upto = 0; - while(true) { - IntsRefFSTEnum.InputOutput pair = fstEnum.next(); - if (pair == null) { - break; - } - for(int j=10;j fst = b.finish(); + for(int verify=0;verify<2;verify++) { - System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); - r = new Random(seed); - Arrays.fill(ints, 0); + r = new Random(seed); + Arrays.fill(ints, 0); - for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); + System.out.println("\nTEST: enum all input/outputs"); + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); - Arrays.fill(ints, 0); - r = new Random(seed); - int upto = 0; - while(true) { - IntsRefFSTEnum.InputOutput pair = fstEnum.next(); - if (pair == null) { - break; + Arrays.fill(ints, 0); + r = new Random(seed); + int upto = 0; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + assertEquals(input, pair.input); + r.nextBytes(outputBytes); + assertEquals(output, pair.output); + upto++; + nextInput(r, ints); + } + assertEquals(count, upto); + + if (verify == 0) { + System.out.println("\nTEST: save/load FST and re-verify"); + IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT); + fst.save(out); + out.close(); + IndexInput in = dir.openInput("fst", IOContext.DEFAULT); + fst = new FST(in, outputs); + in.close(); + } else { + dir.deleteFile("fst"); } - assertEquals(input, pair.input); - r.nextBytes(outputBytes); - assertEquals(output, pair.output); - upto++; - nextInput(r, ints); } - assertEquals(count, upto); } // Build FST w/ PositiveIntOutputs and stop when FST @@ -202,46 +238,62 @@ public class Test2BFST extends LuceneTestCase { FST fst = b.finish(); - System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); + for(int verify=0;verify<2;verify++) { - Arrays.fill(ints, 0); + System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]"); - output = 1; - r = new Random(seed); - for(int i=0;i fstEnum = new IntsRefFSTEnum(fst); - System.out.println("\nTEST: enum all input/outputs"); - IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); - - Arrays.fill(ints, 0); - r = new Random(seed); - int upto = 0; - output = 1; - while(true) { - IntsRefFSTEnum.InputOutput pair = fstEnum.next(); - if (pair == null) { - break; + Arrays.fill(ints, 0); + r = new Random(seed); + int upto = 0; + output = 1; + while(true) { + IntsRefFSTEnum.InputOutput pair = fstEnum.next(); + if (pair == null) { + break; + } + assertEquals(input, pair.input); + assertEquals(output, pair.output.longValue()); + output += 1 + r.nextInt(10); + upto++; + nextInput(r, ints); + } + assertEquals(count, upto); + + if (verify == 0) { + System.out.println("\nTEST: save/load FST and re-verify"); + IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT); + fst.save(out); + out.close(); + IndexInput in = dir.openInput("fst", IOContext.DEFAULT); + fst = new FST(in, outputs); + in.close(); + } else { + dir.deleteFile("fst"); } - assertEquals(input, pair.input); - assertEquals(output, pair.output.longValue()); - output += 1 + r.nextInt(10); - upto++; - nextInput(r, ints); } - assertEquals(count, upto); } } + dir.close(); } private void nextInput(Random r, int[] ints) {