LUCENE-4677: use vInt not int to encode arc's target address in un-packed FSTs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1432466 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-01-12 16:31:28 +00:00
parent 2220c6e8ba
commit d578775e8f
5 changed files with 98 additions and 15 deletions

View File

@ -19,6 +19,9 @@ Changes in backwards compatibility policy
(Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless, (Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless,
Robert Muir) Robert Muir)
* LUCENE-4677: unpacked FSTs now use vInt to encode the node target,
to reduce their size (Mike McCandless)
* LUCENE-4678: FST now uses a paged byte[] structure instead of a * LUCENE-4678: FST now uses a paged byte[] structure instead of a
single byte[] internally, to avoid large memory spikes during single byte[] internally, to avoid large memory spikes during
building (James Dyer, Mike McCandless) building (James Dyer, Mike McCandless)

View File

@ -27,6 +27,11 @@ import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
/*
import java.io.Writer;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
*/
import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataInput;
@ -124,7 +129,10 @@ public final class FST<T> {
/** Added optional packed format. */ /** Added optional packed format. */
private final static int VERSION_PACKED = 3; private final static int VERSION_PACKED = 3;
private final static int VERSION_CURRENT = VERSION_PACKED; /** Changed from int to vInt for encoding arc targets. */
private final static int VERSION_VINT_TARGET = 4;
private final static int VERSION_CURRENT = VERSION_VINT_TARGET;
// Never serialized; just used to represent the virtual // Never serialized; just used to represent the virtual
// final node w/ no arcs: // final node w/ no arcs:
@ -259,12 +267,15 @@ public final class FST<T> {
// clear early on: // clear early on:
private GrowableWriter inCounts; private GrowableWriter inCounts;
private final int version;
// make a new empty FST, for building; Builder invokes // make a new empty FST, for building; Builder invokes
// this ctor // this ctor
FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) { FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) {
this.inputType = inputType; this.inputType = inputType;
this.outputs = outputs; this.outputs = outputs;
this.allowArrayArcs = allowArrayArcs; this.allowArrayArcs = allowArrayArcs;
version = VERSION_CURRENT;
// 32 KB blocks: // 32 KB blocks:
bytes = new BytesStore(15); bytes = new BytesStore(15);
// pad: ensure no node gets address 0 which is reserved to mean // pad: ensure no node gets address 0 which is reserved to mean
@ -289,7 +300,7 @@ public final class FST<T> {
this.outputs = outputs; this.outputs = outputs;
// NOTE: only reads most recent format; we don't have // NOTE: only reads most recent format; we don't have
// back-compat promise for FSTs (they are experimental): // back-compat promise for FSTs (they are experimental):
CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED); version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET);
packed = in.readByte() == 1; packed = in.readByte() == 1;
if (in.readByte() == 1) { if (in.readByte() == 1) {
// accepts empty string // accepts empty string
@ -350,6 +361,15 @@ public final class FST<T> {
// building; we need to break out mutable FST from // building; we need to break out mutable FST from
// immutable // immutable
allowArrayArcs = false; allowArrayArcs = false;
/*
if (bytes.length == 665) {
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
Util.toDot(this, w, false, false);
w.close();
System.out.println("Wrote FST to out.dot");
}
*/
} }
public INPUT_TYPE getInputType() { public INPUT_TYPE getInputType() {
@ -661,7 +681,7 @@ public final class FST<T> {
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
assert target.node > 0; assert target.node > 0;
//System.out.println(" write target"); //System.out.println(" write target");
bytes.writeInt(target.node); bytes.writeVInt(target.node);
} }
// just write the arcs "like normal" on first pass, // just write the arcs "like normal" on first pass,
@ -800,12 +820,10 @@ public final class FST<T> {
} }
if (arc.flag(BIT_STOP_NODE)) { if (arc.flag(BIT_STOP_NODE)) {
} else if (arc.flag(BIT_TARGET_NEXT)) { } else if (arc.flag(BIT_TARGET_NEXT)) {
} else { } else if (packed) {
if (packed) {
in.readVInt(); in.readVInt();
} else { } else {
in.skipBytes(4); readUnpackedNodeTarget(in);
}
} }
arc.flags = in.readByte(); arc.flags = in.readByte();
} }
@ -819,6 +837,16 @@ public final class FST<T> {
} }
} }
/**
 * Reads an arc's target node address from a non-packed FST.
 *
 * <p>The on-disk width of the address depends on the format version this
 * FST was created or loaded with: versions before
 * {@code VERSION_VINT_TARGET} stored it as a fixed-width int, while that
 * version and later store it as a vInt.
 *
 * @param in reader positioned at the start of the encoded target
 * @return the decoded target node address
 * @throws IOException if reading from {@code in} fails
 */
private int readUnpackedNodeTarget(BytesReader in) throws IOException {
  if (version >= VERSION_VINT_TARGET) {
    // Current format: variable-length encoding.
    return in.readVInt();
  }
  // Older format: fixed-width int.
  return in.readInt();
}
/** /**
* Follow the <code>follow</code> arc and read the first arc of its target; * Follow the <code>follow</code> arc and read the first arc of its target;
* this changes the provided <code>arc</code> (2nd arg) in-place and returns * this changes the provided <code>arc</code> (2nd arg) in-place and returns
@ -920,8 +948,10 @@ public final class FST<T> {
final byte b = in.readByte(); final byte b = in.readByte();
if (b == ARCS_AS_FIXED_ARRAY) { if (b == ARCS_AS_FIXED_ARRAY) {
//System.out.println(" nextArc fake array"); //System.out.println(" nextArc fixed array");
in.readVInt(); in.readVInt();
// Skip bytesPerArc:
if (packed) { if (packed) {
in.readVInt(); in.readVInt();
} else { } else {
@ -1024,7 +1054,7 @@ public final class FST<T> {
//System.out.println(" abs code=" + code); //System.out.println(" abs code=" + code);
} }
} else { } else {
arc.target = in.readInt(); arc.target = readUnpackedNodeTarget(in);
} }
arc.nextArc = in.getPosition(); arc.nextArc = in.getPosition();
} }
@ -1147,7 +1177,7 @@ public final class FST<T> {
if (packed) { if (packed) {
in.readVInt(); in.readVInt();
} else { } else {
in.readInt(); readUnpackedNodeTarget(in);
} }
} }
@ -1354,6 +1384,7 @@ public final class FST<T> {
// Creates a packed FST // Creates a packed FST
private FST(INPUT_TYPE inputType, Outputs<T> outputs) { private FST(INPUT_TYPE inputType, Outputs<T> outputs) {
version = VERSION_CURRENT;
packed = true; packed = true;
this.inputType = inputType; this.inputType = inputType;
// 32 KB blocks: // 32 KB blocks:

View File

@ -58,7 +58,6 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -116,6 +115,38 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
} }
*/ */
/*
public void testCreateMoreTermsIndex() throws Exception {
// we use a real directory name that is not cleaned up,
// because this method is only used to create backwards
// indexes:
File indexDir = new File("moreterms");
_TestUtil.rmDir(indexDir);
Directory dir = newFSDirectory(indexDir);
LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
mp.setUseCompoundFile(false);
mp.setNoCFSRatio(1.0);
mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
// TODO: remove randomness
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
.setMergePolicy(mp);
conf.setCodec(Codec.forName("Lucene40"));
IndexWriter writer = new IndexWriter(dir, conf);
LineFileDocs docs = new LineFileDocs(null, true);
for(int i=0;i<50;i++) {
writer.addDocument(docs.nextDoc());
}
writer.close();
dir.close();
// Gives you time to copy the index out!: (there is also
// a test option to not remove temp dir...):
Thread.sleep(100000);
}
*/
final static String[] oldNames = {"40.cfs", final static String[] oldNames = {"40.cfs",
"40.nocfs", "40.nocfs",
}; };
@ -908,4 +939,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
dir.close(); dir.close();
} }
} }
public static final String moreTermsIndex = "moreterms.40.zip";
/**
 * Unzips the pre-built index named by {@link #moreTermsIndex} into a temp
 * directory, opens it, and runs {@code _TestUtil.checkIndex} against it.
 */
public void testMoreTerms() throws Exception {
  final File unzippedIndex = _TestUtil.getTempDir("moreterms");
  _TestUtil.unzip(getDataFile(moreTermsIndex), unzippedIndex);

  final Directory indexDirectory = newFSDirectory(unzippedIndex);
  // TODO: more tests
  _TestUtil.checkIndex(indexDirectory);
  indexDirectory.close();
}
} }

View File

@ -289,6 +289,13 @@ public class FSTTester<T> {
null, null,
willRewrite, willRewrite,
true); true);
if (LuceneTestCase.VERBOSE) {
if (willRewrite) {
System.out.println("TEST: packed FST");
} else {
System.out.println("TEST: non-packed FST");
}
}
for(InputOutput<T> pair : pairs) { for(InputOutput<T> pair : pairs) {
if (pair.output instanceof List) { if (pair.output instanceof List) {