LUCENE-4739: fix FST.save/load to work with > 1.1 GB FSTs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1441213 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-01-31 21:51:58 +00:00
parent 51fdde39b7
commit 549855a0d6
4 changed files with 164 additions and 99 deletions

View File

@ -123,6 +123,9 @@ Bug Fixes
* LUCENE-4732: Fixed TermsEnum.seekCeil/seekExact on term vectors.
(Adrien Grand, Robert Muir)
* LUCENE-4739: Fixed bugs that prevented FSTs more than ~1.1GB from
being saved and loaded (Adrien Grand, Mike McCandless)
======================= Lucene 4.1.0 =======================
Changes in backwards compatibility policy

View File

@ -46,7 +46,7 @@ class BytesStore extends DataOutput {
}
/** Pulls bytes from the provided IndexInput. */
public BytesStore(DataInput in, int numBytes, int maxBlockSize) throws IOException {
public BytesStore(DataInput in, long numBytes, int maxBlockSize) throws IOException {
int blockSize = 2;
int blockBits = 1;
while(blockSize < numBytes && blockSize < maxBlockSize) {
@ -56,9 +56,9 @@ class BytesStore extends DataOutput {
this.blockBits = blockBits;
this.blockSize = blockSize;
this.blockMask = blockSize-1;
int left = numBytes;
long left = numBytes;
while(left > 0) {
final int chunk = Math.min(blockSize, left);
final int chunk = (int) Math.min(blockSize, left);
byte[] block = new byte[chunk];
in.readBytes(block, 0, block.length);
blocks.add(block);

View File

@ -27,11 +27,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;
/*
import java.io.Writer;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
*/
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ByteArrayDataOutput;
@ -41,12 +36,15 @@ import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.fst.Builder.UnCompiledNode;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
//import java.io.Writer;
//import java.io.OutputStreamWriter;
// TODO: break this into WritableFST and ReadOnlyFST.. then
// we can have subclasses of ReadOnlyFST to handle the
@ -276,7 +274,6 @@ public final class FST<T> {
this.outputs = outputs;
this.allowArrayArcs = allowArrayArcs;
version = VERSION_CURRENT;
// 32 KB blocks:
bytes = new BytesStore(bytesPageBits);
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
@ -295,9 +292,22 @@ public final class FST<T> {
nodeRefToAddress = null;
}
public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
/** Load a previously saved FST. */
public FST(DataInput in, Outputs<T> outputs) throws IOException {
this(in, outputs, DEFAULT_MAX_BLOCK_BITS);
}
/** Load a previously saved FST; maxBlockBits allows you to
* control the size of the byte[] pages used to hold the FST bytes. */
public FST(DataInput in, Outputs<T> outputs, int maxBlockBits) throws IOException {
this.outputs = outputs;
if (maxBlockBits < 1 || maxBlockBits > 30) {
throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
}
// NOTE: only reads most recent format; we don't have
// back-compat promise for FSTs (they are experimental):
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET);
@ -345,13 +355,13 @@ public final class FST<T> {
} else {
nodeRefToAddress = null;
}
startNode = in.readVInt();
startNode = in.readVLong();
nodeCount = in.readVLong();
arcCount = in.readVLong();
arcWithOutputCount = in.readVLong();
int numBytes = in.readVInt();
bytes = new BytesStore(in, numBytes, Integer.MAX_VALUE);
long numBytes = in.readVLong();
bytes = new BytesStore(in, numBytes, 1<<maxBlockBits);
NO_OUTPUT = outputs.getNoOutput();

View File

@ -20,10 +20,16 @@ package org.apache.lucene.util.fst;
import java.util.Arrays;
import java.util.Random;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TimeUnits;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.packed.PackedInts;
import org.junit.Ignore;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
@ -39,6 +45,8 @@ public class Test2BFST extends LuceneTestCase {
IntsRef input = new IntsRef(ints, 0, ints.length);
long seed = random().nextLong();
Directory dir = new MMapDirectory(_TestUtil.getTempDir("2BFST"));
for(int doPackIter=0;doPackIter<2;doPackIter++) {
boolean doPack = doPackIter == 1;
@ -72,42 +80,56 @@ public class Test2BFST extends LuceneTestCase {
FST<Object> fst = b.finish();
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
Arrays.fill(ints2, 0);
r = new Random(seed);
Arrays.fill(ints2, 0);
r = new Random(seed);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
}
for(int j=10;j<ints2.length;j++) {
ints2[j] = r.nextInt(256);
}
assertEquals(NO_OUTPUT, Util.get(fst, input2));
nextInput(r, ints2);
}
for(int j=10;j<ints2.length;j++) {
ints2[j] = r.nextInt(256);
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<Object>(fst);
Arrays.fill(ints2, 0);
r = new Random(seed);
int upto = 0;
while(true) {
IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
if (pair == null) {
break;
}
for(int j=10;j<ints2.length;j++) {
ints2[j] = r.nextInt(256);
}
assertEquals(input2, pair.input);
assertEquals(NO_OUTPUT, pair.output);
upto++;
nextInput(r, ints2);
}
assertEquals(count, upto);
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<Object>(in, outputs);
in.close();
} else {
dir.deleteFile("fst");
}
assertEquals(NO_OUTPUT, Util.get(fst, input2));
nextInput(r, ints2);
}
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<Object>(fst);
Arrays.fill(ints2, 0);
r = new Random(seed);
int upto = 0;
while(true) {
IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
if (pair == null) {
break;
}
for(int j=10;j<ints2.length;j++) {
ints2[j] = r.nextInt(256);
}
assertEquals(input2, pair.input);
assertEquals(NO_OUTPUT, pair.output);
upto++;
nextInput(r, ints2);
}
assertEquals(count, upto);
}
// Build FST w/ ByteSequenceOutputs and stop when FST
@ -138,39 +160,53 @@ public class Test2BFST extends LuceneTestCase {
}
FST<BytesRef> fst = b.finish();
for(int verify=0;verify<2;verify++) {
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
r = new Random(seed);
Arrays.fill(ints, 0);
r = new Random(seed);
Arrays.fill(ints, 0);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
}
r.nextBytes(outputBytes);
assertEquals(output, Util.get(fst, input));
nextInput(r, ints);
}
r.nextBytes(outputBytes);
assertEquals(output, Util.get(fst, input));
nextInput(r, ints);
}
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);
Arrays.fill(ints, 0);
r = new Random(seed);
int upto = 0;
while(true) {
IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
if (pair == null) {
break;
Arrays.fill(ints, 0);
r = new Random(seed);
int upto = 0;
while(true) {
IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
if (pair == null) {
break;
}
assertEquals(input, pair.input);
r.nextBytes(outputBytes);
assertEquals(output, pair.output);
upto++;
nextInput(r, ints);
}
assertEquals(count, upto);
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<BytesRef>(in, outputs);
in.close();
} else {
dir.deleteFile("fst");
}
assertEquals(input, pair.input);
r.nextBytes(outputBytes);
assertEquals(output, pair.output);
upto++;
nextInput(r, ints);
}
assertEquals(count, upto);
}
// Build FST w/ PositiveIntOutputs and stop when FST
@ -202,46 +238,62 @@ public class Test2BFST extends LuceneTestCase {
FST<Long> fst = b.finish();
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
for(int verify=0;verify<2;verify++) {
Arrays.fill(ints, 0);
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
output = 1;
r = new Random(seed);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
Arrays.fill(ints, 0);
output = 1;
r = new Random(seed);
for(int i=0;i<count;i++) {
if (i % 1000000 == 0) {
System.out.println(i + "...: ");
}
// forward lookup:
assertEquals(output, Util.get(fst, input).longValue());
// reverse lookup:
assertEquals(input, Util.getByOutput(fst, output));
output += 1 + r.nextInt(10);
nextInput(r, ints);
}
// forward lookup:
assertEquals(output, Util.get(fst, input).longValue());
// reverse lookup:
assertEquals(input, Util.getByOutput(fst, output));
output += 1 + r.nextInt(10);
nextInput(r, ints);
}
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
System.out.println("\nTEST: enum all input/outputs");
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
Arrays.fill(ints, 0);
r = new Random(seed);
int upto = 0;
output = 1;
while(true) {
IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
if (pair == null) {
break;
Arrays.fill(ints, 0);
r = new Random(seed);
int upto = 0;
output = 1;
while(true) {
IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
if (pair == null) {
break;
}
assertEquals(input, pair.input);
assertEquals(output, pair.output.longValue());
output += 1 + r.nextInt(10);
upto++;
nextInput(r, ints);
}
assertEquals(count, upto);
if (verify == 0) {
System.out.println("\nTEST: save/load FST and re-verify");
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
fst.save(out);
out.close();
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
fst = new FST<Long>(in, outputs);
in.close();
} else {
dir.deleteFile("fst");
}
assertEquals(input, pair.input);
assertEquals(output, pair.output.longValue());
output += 1 + r.nextInt(10);
upto++;
nextInput(r, ints);
}
assertEquals(count, upto);
}
}
dir.close();
}
private void nextInput(Random r, int[] ints) {