mirror of https://github.com/apache/lucene.git
LUCENE-4739: fix FST.save/load to work with > 1.1 GB FSTs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1441213 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
51fdde39b7
commit
549855a0d6
|
@ -123,6 +123,9 @@ Bug Fixes
|
|||
* LUCENE-4732: Fixed TermsEnum.seekCeil/seekExact on term vectors.
|
||||
(Adrien Grand, Robert Muir)
|
||||
|
||||
* LUCENE-4739: Fixed bugs that prevented FSTs more than ~1.1GB from
|
||||
being saved and loaded (Adrien Grand, Mike McCandless)
|
||||
|
||||
======================= Lucene 4.1.0 =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
|
|
@ -46,7 +46,7 @@ class BytesStore extends DataOutput {
|
|||
}
|
||||
|
||||
/** Pulls bytes from the provided IndexInput. */
|
||||
public BytesStore(DataInput in, int numBytes, int maxBlockSize) throws IOException {
|
||||
public BytesStore(DataInput in, long numBytes, int maxBlockSize) throws IOException {
|
||||
int blockSize = 2;
|
||||
int blockBits = 1;
|
||||
while(blockSize < numBytes && blockSize < maxBlockSize) {
|
||||
|
@ -56,9 +56,9 @@ class BytesStore extends DataOutput {
|
|||
this.blockBits = blockBits;
|
||||
this.blockSize = blockSize;
|
||||
this.blockMask = blockSize-1;
|
||||
int left = numBytes;
|
||||
long left = numBytes;
|
||||
while(left > 0) {
|
||||
final int chunk = Math.min(blockSize, left);
|
||||
final int chunk = (int) Math.min(blockSize, left);
|
||||
byte[] block = new byte[chunk];
|
||||
in.readBytes(block, 0, block.length);
|
||||
blocks.add(block);
|
||||
|
|
|
@ -27,11 +27,6 @@ import java.io.InputStream;
|
|||
import java.io.OutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
/*
|
||||
import java.io.Writer;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.FileOutputStream;
|
||||
*/
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
|
@ -41,12 +36,15 @@ import org.apache.lucene.store.InputStreamDataInput;
|
|||
import org.apache.lucene.store.OutputStreamDataOutput;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Constants;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.lucene.util.fst.Builder.UnCompiledNode;
|
||||
import org.apache.lucene.util.packed.GrowableWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
//import java.io.Writer;
|
||||
//import java.io.OutputStreamWriter;
|
||||
|
||||
// TODO: break this into WritableFST and ReadOnlyFST.. then
|
||||
// we can have subclasses of ReadOnlyFST to handle the
|
||||
|
@ -276,7 +274,6 @@ public final class FST<T> {
|
|||
this.outputs = outputs;
|
||||
this.allowArrayArcs = allowArrayArcs;
|
||||
version = VERSION_CURRENT;
|
||||
// 32 KB blocks:
|
||||
bytes = new BytesStore(bytesPageBits);
|
||||
// pad: ensure no node gets address 0 which is reserved to mean
|
||||
// the stop state w/ no arcs
|
||||
|
@ -295,9 +292,22 @@ public final class FST<T> {
|
|||
nodeRefToAddress = null;
|
||||
}
|
||||
|
||||
public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
|
||||
|
||||
/** Load a previously saved FST. */
|
||||
public FST(DataInput in, Outputs<T> outputs) throws IOException {
|
||||
this(in, outputs, DEFAULT_MAX_BLOCK_BITS);
|
||||
}
|
||||
|
||||
/** Load a previously saved FST; maxBlockBits allows you to
|
||||
* control the size of the byte[] pages used to hold the FST bytes. */
|
||||
public FST(DataInput in, Outputs<T> outputs, int maxBlockBits) throws IOException {
|
||||
this.outputs = outputs;
|
||||
|
||||
if (maxBlockBits < 1 || maxBlockBits > 30) {
|
||||
throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
|
||||
}
|
||||
|
||||
// NOTE: only reads most recent format; we don't have
|
||||
// back-compat promise for FSTs (they are experimental):
|
||||
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET);
|
||||
|
@ -345,13 +355,13 @@ public final class FST<T> {
|
|||
} else {
|
||||
nodeRefToAddress = null;
|
||||
}
|
||||
startNode = in.readVInt();
|
||||
startNode = in.readVLong();
|
||||
nodeCount = in.readVLong();
|
||||
arcCount = in.readVLong();
|
||||
arcWithOutputCount = in.readVLong();
|
||||
|
||||
int numBytes = in.readVInt();
|
||||
bytes = new BytesStore(in, numBytes, Integer.MAX_VALUE);
|
||||
long numBytes = in.readVLong();
|
||||
bytes = new BytesStore(in, numBytes, 1<<maxBlockBits);
|
||||
|
||||
NO_OUTPUT = outputs.getNoOutput();
|
||||
|
||||
|
|
|
@ -20,10 +20,16 @@ package org.apache.lucene.util.fst;
|
|||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.store.MMapDirectory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TimeUnits;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
import org.junit.Ignore;
|
||||
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
|
||||
|
@ -39,6 +45,8 @@ public class Test2BFST extends LuceneTestCase {
|
|||
IntsRef input = new IntsRef(ints, 0, ints.length);
|
||||
long seed = random().nextLong();
|
||||
|
||||
Directory dir = new MMapDirectory(_TestUtil.getTempDir("2BFST"));
|
||||
|
||||
for(int doPackIter=0;doPackIter<2;doPackIter++) {
|
||||
boolean doPack = doPackIter == 1;
|
||||
|
||||
|
@ -72,42 +80,56 @@ public class Test2BFST extends LuceneTestCase {
|
|||
|
||||
FST<Object> fst = b.finish();
|
||||
|
||||
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
|
||||
for(int verify=0;verify<2;verify++) {
|
||||
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
|
||||
|
||||
Arrays.fill(ints2, 0);
|
||||
r = new Random(seed);
|
||||
Arrays.fill(ints2, 0);
|
||||
r = new Random(seed);
|
||||
|
||||
for(int i=0;i<count;i++) {
|
||||
if (i % 1000000 == 0) {
|
||||
System.out.println(i + "...: ");
|
||||
for(int i=0;i<count;i++) {
|
||||
if (i % 1000000 == 0) {
|
||||
System.out.println(i + "...: ");
|
||||
}
|
||||
for(int j=10;j<ints2.length;j++) {
|
||||
ints2[j] = r.nextInt(256);
|
||||
}
|
||||
assertEquals(NO_OUTPUT, Util.get(fst, input2));
|
||||
nextInput(r, ints2);
|
||||
}
|
||||
for(int j=10;j<ints2.length;j++) {
|
||||
ints2[j] = r.nextInt(256);
|
||||
|
||||
System.out.println("\nTEST: enum all input/outputs");
|
||||
IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<Object>(fst);
|
||||
|
||||
Arrays.fill(ints2, 0);
|
||||
r = new Random(seed);
|
||||
int upto = 0;
|
||||
while(true) {
|
||||
IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
|
||||
if (pair == null) {
|
||||
break;
|
||||
}
|
||||
for(int j=10;j<ints2.length;j++) {
|
||||
ints2[j] = r.nextInt(256);
|
||||
}
|
||||
assertEquals(input2, pair.input);
|
||||
assertEquals(NO_OUTPUT, pair.output);
|
||||
upto++;
|
||||
nextInput(r, ints2);
|
||||
}
|
||||
assertEquals(count, upto);
|
||||
|
||||
if (verify == 0) {
|
||||
System.out.println("\nTEST: save/load FST and re-verify");
|
||||
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
|
||||
fst.save(out);
|
||||
out.close();
|
||||
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
|
||||
fst = new FST<Object>(in, outputs);
|
||||
in.close();
|
||||
} else {
|
||||
dir.deleteFile("fst");
|
||||
}
|
||||
assertEquals(NO_OUTPUT, Util.get(fst, input2));
|
||||
nextInput(r, ints2);
|
||||
}
|
||||
|
||||
System.out.println("\nTEST: enum all input/outputs");
|
||||
IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<Object>(fst);
|
||||
|
||||
Arrays.fill(ints2, 0);
|
||||
r = new Random(seed);
|
||||
int upto = 0;
|
||||
while(true) {
|
||||
IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
|
||||
if (pair == null) {
|
||||
break;
|
||||
}
|
||||
for(int j=10;j<ints2.length;j++) {
|
||||
ints2[j] = r.nextInt(256);
|
||||
}
|
||||
assertEquals(input2, pair.input);
|
||||
assertEquals(NO_OUTPUT, pair.output);
|
||||
upto++;
|
||||
nextInput(r, ints2);
|
||||
}
|
||||
assertEquals(count, upto);
|
||||
}
|
||||
|
||||
// Build FST w/ ByteSequenceOutputs and stop when FST
|
||||
|
@ -138,39 +160,53 @@ public class Test2BFST extends LuceneTestCase {
|
|||
}
|
||||
|
||||
FST<BytesRef> fst = b.finish();
|
||||
for(int verify=0;verify<2;verify++) {
|
||||
|
||||
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
|
||||
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
|
||||
|
||||
r = new Random(seed);
|
||||
Arrays.fill(ints, 0);
|
||||
r = new Random(seed);
|
||||
Arrays.fill(ints, 0);
|
||||
|
||||
for(int i=0;i<count;i++) {
|
||||
if (i % 1000000 == 0) {
|
||||
System.out.println(i + "...: ");
|
||||
for(int i=0;i<count;i++) {
|
||||
if (i % 1000000 == 0) {
|
||||
System.out.println(i + "...: ");
|
||||
}
|
||||
r.nextBytes(outputBytes);
|
||||
assertEquals(output, Util.get(fst, input));
|
||||
nextInput(r, ints);
|
||||
}
|
||||
r.nextBytes(outputBytes);
|
||||
assertEquals(output, Util.get(fst, input));
|
||||
nextInput(r, ints);
|
||||
}
|
||||
|
||||
System.out.println("\nTEST: enum all input/outputs");
|
||||
IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);
|
||||
System.out.println("\nTEST: enum all input/outputs");
|
||||
IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);
|
||||
|
||||
Arrays.fill(ints, 0);
|
||||
r = new Random(seed);
|
||||
int upto = 0;
|
||||
while(true) {
|
||||
IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
|
||||
if (pair == null) {
|
||||
break;
|
||||
Arrays.fill(ints, 0);
|
||||
r = new Random(seed);
|
||||
int upto = 0;
|
||||
while(true) {
|
||||
IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
|
||||
if (pair == null) {
|
||||
break;
|
||||
}
|
||||
assertEquals(input, pair.input);
|
||||
r.nextBytes(outputBytes);
|
||||
assertEquals(output, pair.output);
|
||||
upto++;
|
||||
nextInput(r, ints);
|
||||
}
|
||||
assertEquals(count, upto);
|
||||
|
||||
if (verify == 0) {
|
||||
System.out.println("\nTEST: save/load FST and re-verify");
|
||||
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
|
||||
fst.save(out);
|
||||
out.close();
|
||||
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
|
||||
fst = new FST<BytesRef>(in, outputs);
|
||||
in.close();
|
||||
} else {
|
||||
dir.deleteFile("fst");
|
||||
}
|
||||
assertEquals(input, pair.input);
|
||||
r.nextBytes(outputBytes);
|
||||
assertEquals(output, pair.output);
|
||||
upto++;
|
||||
nextInput(r, ints);
|
||||
}
|
||||
assertEquals(count, upto);
|
||||
}
|
||||
|
||||
// Build FST w/ PositiveIntOutputs and stop when FST
|
||||
|
@ -202,46 +238,62 @@ public class Test2BFST extends LuceneTestCase {
|
|||
|
||||
FST<Long> fst = b.finish();
|
||||
|
||||
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
|
||||
for(int verify=0;verify<2;verify++) {
|
||||
|
||||
Arrays.fill(ints, 0);
|
||||
System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount=" + fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
|
||||
|
||||
output = 1;
|
||||
r = new Random(seed);
|
||||
for(int i=0;i<count;i++) {
|
||||
if (i % 1000000 == 0) {
|
||||
System.out.println(i + "...: ");
|
||||
Arrays.fill(ints, 0);
|
||||
|
||||
output = 1;
|
||||
r = new Random(seed);
|
||||
for(int i=0;i<count;i++) {
|
||||
if (i % 1000000 == 0) {
|
||||
System.out.println(i + "...: ");
|
||||
}
|
||||
|
||||
// forward lookup:
|
||||
assertEquals(output, Util.get(fst, input).longValue());
|
||||
// reverse lookup:
|
||||
assertEquals(input, Util.getByOutput(fst, output));
|
||||
output += 1 + r.nextInt(10);
|
||||
nextInput(r, ints);
|
||||
}
|
||||
|
||||
// forward lookup:
|
||||
assertEquals(output, Util.get(fst, input).longValue());
|
||||
// reverse lookup:
|
||||
assertEquals(input, Util.getByOutput(fst, output));
|
||||
output += 1 + r.nextInt(10);
|
||||
nextInput(r, ints);
|
||||
}
|
||||
System.out.println("\nTEST: enum all input/outputs");
|
||||
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
|
||||
|
||||
System.out.println("\nTEST: enum all input/outputs");
|
||||
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
|
||||
|
||||
Arrays.fill(ints, 0);
|
||||
r = new Random(seed);
|
||||
int upto = 0;
|
||||
output = 1;
|
||||
while(true) {
|
||||
IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
|
||||
if (pair == null) {
|
||||
break;
|
||||
Arrays.fill(ints, 0);
|
||||
r = new Random(seed);
|
||||
int upto = 0;
|
||||
output = 1;
|
||||
while(true) {
|
||||
IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
|
||||
if (pair == null) {
|
||||
break;
|
||||
}
|
||||
assertEquals(input, pair.input);
|
||||
assertEquals(output, pair.output.longValue());
|
||||
output += 1 + r.nextInt(10);
|
||||
upto++;
|
||||
nextInput(r, ints);
|
||||
}
|
||||
assertEquals(count, upto);
|
||||
|
||||
if (verify == 0) {
|
||||
System.out.println("\nTEST: save/load FST and re-verify");
|
||||
IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
|
||||
fst.save(out);
|
||||
out.close();
|
||||
IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
|
||||
fst = new FST<Long>(in, outputs);
|
||||
in.close();
|
||||
} else {
|
||||
dir.deleteFile("fst");
|
||||
}
|
||||
assertEquals(input, pair.input);
|
||||
assertEquals(output, pair.output.longValue());
|
||||
output += 1 + r.nextInt(10);
|
||||
upto++;
|
||||
nextInput(r, ints);
|
||||
}
|
||||
assertEquals(count, upto);
|
||||
}
|
||||
}
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private void nextInput(Random r, int[] ints) {
|
||||
|
|
Loading…
Reference in New Issue