LUCENE-4677: use vInt not int to encode arc's target address in un-packed FSTs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1432466 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-01-12 16:31:28 +00:00
parent 2220c6e8ba
commit d578775e8f
5 changed files with 98 additions and 15 deletions

View File

@ -19,6 +19,9 @@ Changes in backwards compatibility policy
(Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless,
Robert Muir)
* LUCENE-4677: unpacked FSTs now use vInt to encode the node target,
to reduce their size (Mike McCandless)
* LUCENE-4678: FST now uses a paged byte[] structure instead of a
single byte[] internally, to avoid large memory spikes during
building (James Dyer, Mike McCandless)

View File

@ -27,6 +27,11 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;
/*
import java.io.Writer;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
*/
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
@ -124,7 +129,10 @@ public final class FST<T> {
/** Added optional packed format. */
private final static int VERSION_PACKED = 3;
private final static int VERSION_CURRENT = VERSION_PACKED;
/** Changed from int to vInt for encoding arc targets. */
private final static int VERSION_VINT_TARGET = 4;
private final static int VERSION_CURRENT = VERSION_VINT_TARGET;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
@ -259,12 +267,15 @@ public final class FST<T> {
// clear early on:
private GrowableWriter inCounts;
private final int version;
// make a new empty FST, for building; Builder invokes
// this ctor
FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) {
this.inputType = inputType;
this.outputs = outputs;
this.allowArrayArcs = allowArrayArcs;
version = VERSION_CURRENT;
// 32 KB blocks:
bytes = new BytesStore(15);
// pad: ensure no node gets address 0 which is reserved to mean
@ -289,7 +300,7 @@ public final class FST<T> {
this.outputs = outputs;
// NOTE: accepts formats from VERSION_PACKED through
// VERSION_VINT_TARGET; we don't have a general
// back-compat promise for FSTs (they are experimental):
CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED);
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET);
packed = in.readByte() == 1;
if (in.readByte() == 1) {
// accepts empty string
@ -350,6 +361,15 @@ public final class FST<T> {
// building; we need to break out mutable FST from
// immutable
allowArrayArcs = false;
/*
if (bytes.length == 665) {
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
Util.toDot(this, w, false, false);
w.close();
System.out.println("Wrote FST to out.dot");
}
*/
}
public INPUT_TYPE getInputType() {
@ -661,7 +681,7 @@ public final class FST<T> {
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
assert target.node > 0;
//System.out.println(" write target");
bytes.writeInt(target.node);
bytes.writeVInt(target.node);
}
// just write the arcs "like normal" on first pass,
@ -800,12 +820,10 @@ public final class FST<T> {
}
if (arc.flag(BIT_STOP_NODE)) {
} else if (arc.flag(BIT_TARGET_NEXT)) {
} else if (packed) {
in.readVInt();
} else {
if (packed) {
in.readVInt();
} else {
in.skipBytes(4);
}
readUnpackedNodeTarget(in);
}
arc.flags = in.readByte();
}
@ -819,6 +837,16 @@ public final class FST<T> {
}
}
/**
 * Reads an arc's target node address from a non-packed FST.
 * Formats older than {@code VERSION_VINT_TARGET} encoded the address as a
 * fixed-width int; from that version on it is encoded as a vInt.
 */
private int readUnpackedNodeTarget(BytesReader in) throws IOException {
  if (version >= VERSION_VINT_TARGET) {
    return in.readVInt();
  }
  // Older format: the target was written as a plain int.
  return in.readInt();
}
/**
* Follow the <code>follow</code> arc and read the first arc of its target;
* this changes the provided <code>arc</code> (2nd arg) in-place and returns
@ -920,8 +948,10 @@ public final class FST<T> {
final byte b = in.readByte();
if (b == ARCS_AS_FIXED_ARRAY) {
//System.out.println(" nextArc fake array");
//System.out.println(" nextArc fixed array");
in.readVInt();
// Skip bytesPerArc:
if (packed) {
in.readVInt();
} else {
@ -1024,7 +1054,7 @@ public final class FST<T> {
//System.out.println(" abs code=" + code);
}
} else {
arc.target = in.readInt();
arc.target = readUnpackedNodeTarget(in);
}
arc.nextArc = in.getPosition();
}
@ -1147,7 +1177,7 @@ public final class FST<T> {
if (packed) {
in.readVInt();
} else {
in.readInt();
readUnpackedNodeTarget(in);
}
}
@ -1354,6 +1384,7 @@ public final class FST<T> {
// Creates a packed FST
private FST(INPUT_TYPE inputType, Outputs<T> outputs) {
version = VERSION_CURRENT;
packed = true;
this.inputType = inputType;
// 32 KB blocks:

View File

@ -58,7 +58,6 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
@ -116,8 +115,40 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
}
*/
/*
public void testCreateMoreTermsIndex() throws Exception {
// we use a real directory name that is not cleaned up,
// because this method is only used to create backwards
// indexes:
File indexDir = new File("moreterms");
_TestUtil.rmDir(indexDir);
Directory dir = newFSDirectory(indexDir);
LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
mp.setUseCompoundFile(false);
mp.setNoCFSRatio(1.0);
mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
// TODO: remove randomness
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
.setMergePolicy(mp);
conf.setCodec(Codec.forName("Lucene40"));
IndexWriter writer = new IndexWriter(dir, conf);
LineFileDocs docs = new LineFileDocs(null, true);
for(int i=0;i<50;i++) {
writer.addDocument(docs.nextDoc());
}
writer.close();
dir.close();
// Gives you time to copy the index out!: (there is also
// a test option to not remove temp dir...):
Thread.sleep(100000);
}
*/
final static String[] oldNames = {"40.cfs",
"40.nocfs",
"40.nocfs",
};
final String[] unsupportedNames = {"19.cfs",
@ -145,7 +176,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
};
final static String[] oldSingleSegmentNames = {"40.optimized.cfs",
"40.optimized.nocfs",
"40.optimized.nocfs",
};
static Map<String,Directory> oldIndexDirs;
@ -908,4 +939,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
dir.close();
}
}
public static final String moreTermsIndex = "moreterms.40.zip";
/**
 * Unzips the pre-built index named by {@code moreTermsIndex} into a temp
 * directory and runs {@code _TestUtil.checkIndex} against it.
 */
public void testMoreTerms() throws Exception {
  File oldIndexDir = _TestUtil.getTempDir("moreterms");
  _TestUtil.unzip(getDataFile(moreTermsIndex), oldIndexDir);
  Directory dir = newFSDirectory(oldIndexDir);
  try {
    // TODO: more tests
    _TestUtil.checkIndex(dir);
  } finally {
    // Close the directory even when checkIndex throws, so a failing
    // check does not also leave the directory open.
    dir.close();
  }
}
}

View File

@ -289,6 +289,13 @@ public class FSTTester<T> {
null,
willRewrite,
true);
if (LuceneTestCase.VERBOSE) {
if (willRewrite) {
System.out.println("TEST: packed FST");
} else {
System.out.println("TEST: non-packed FST");
}
}
for(InputOutput<T> pair : pairs) {
if (pair.output instanceof List) {