mirror of https://github.com/apache/lucene.git
LUCENE-4677: use vInt not int to encode arc's target address in un-packed FSTs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1432466 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2220c6e8ba
commit
d578775e8f
|
@ -19,6 +19,9 @@ Changes in backwards compatibility policy
|
||||||
(Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless,
|
(Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless,
|
||||||
Robert Muir)
|
Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-4677: unpacked FSTs now use vInt to encode the node target,
|
||||||
|
to reduce their size (Mike McCandless)
|
||||||
|
|
||||||
* LUCENE-4678: FST now uses a paged byte[] structure instead of a
|
* LUCENE-4678: FST now uses a paged byte[] structure instead of a
|
||||||
single byte[] internally, to avoid large memory spikes during
|
single byte[] internally, to avoid large memory spikes during
|
||||||
building (James Dyer, Mike McCandless)
|
building (James Dyer, Mike McCandless)
|
||||||
|
|
|
@ -27,6 +27,11 @@ import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
/*
|
||||||
|
import java.io.Writer;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.store.DataInput;
|
import org.apache.lucene.store.DataInput;
|
||||||
|
@ -124,7 +129,10 @@ public final class FST<T> {
|
||||||
/** Added optional packed format. */
|
/** Added optional packed format. */
|
||||||
private final static int VERSION_PACKED = 3;
|
private final static int VERSION_PACKED = 3;
|
||||||
|
|
||||||
private final static int VERSION_CURRENT = VERSION_PACKED;
|
/** Changed from int to vInt for encoding arc targets. */
|
||||||
|
private final static int VERSION_VINT_TARGET = 4;
|
||||||
|
|
||||||
|
private final static int VERSION_CURRENT = VERSION_VINT_TARGET;
|
||||||
|
|
||||||
// Never serialized; just used to represent the virtual
|
// Never serialized; just used to represent the virtual
|
||||||
// final node w/ no arcs:
|
// final node w/ no arcs:
|
||||||
|
@ -259,12 +267,15 @@ public final class FST<T> {
|
||||||
// clear early on:
|
// clear early on:
|
||||||
private GrowableWriter inCounts;
|
private GrowableWriter inCounts;
|
||||||
|
|
||||||
|
private final int version;
|
||||||
|
|
||||||
// make a new empty FST, for building; Builder invokes
|
// make a new empty FST, for building; Builder invokes
|
||||||
// this ctor
|
// this ctor
|
||||||
FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) {
|
FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) {
|
||||||
this.inputType = inputType;
|
this.inputType = inputType;
|
||||||
this.outputs = outputs;
|
this.outputs = outputs;
|
||||||
this.allowArrayArcs = allowArrayArcs;
|
this.allowArrayArcs = allowArrayArcs;
|
||||||
|
version = VERSION_CURRENT;
|
||||||
// 32 KB blocks:
|
// 32 KB blocks:
|
||||||
bytes = new BytesStore(15);
|
bytes = new BytesStore(15);
|
||||||
// pad: ensure no node gets address 0 which is reserved to mean
|
// pad: ensure no node gets address 0 which is reserved to mean
|
||||||
|
@ -289,7 +300,7 @@ public final class FST<T> {
|
||||||
this.outputs = outputs;
|
this.outputs = outputs;
|
||||||
// NOTE: only reads most recent format; we don't have
|
// NOTE: only reads most recent format; we don't have
|
||||||
// back-compat promise for FSTs (they are experimental):
|
// back-compat promise for FSTs (they are experimental):
|
||||||
CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED);
|
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET);
|
||||||
packed = in.readByte() == 1;
|
packed = in.readByte() == 1;
|
||||||
if (in.readByte() == 1) {
|
if (in.readByte() == 1) {
|
||||||
// accepts empty string
|
// accepts empty string
|
||||||
|
@ -350,6 +361,15 @@ public final class FST<T> {
|
||||||
// building; we need to break out mutable FST from
|
// building; we need to break out mutable FST from
|
||||||
// immutable
|
// immutable
|
||||||
allowArrayArcs = false;
|
allowArrayArcs = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
if (bytes.length == 665) {
|
||||||
|
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
||||||
|
Util.toDot(this, w, false, false);
|
||||||
|
w.close();
|
||||||
|
System.out.println("Wrote FST to out.dot");
|
||||||
|
}
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
public INPUT_TYPE getInputType() {
|
public INPUT_TYPE getInputType() {
|
||||||
|
@ -661,7 +681,7 @@ public final class FST<T> {
|
||||||
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
|
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
|
||||||
assert target.node > 0;
|
assert target.node > 0;
|
||||||
//System.out.println(" write target");
|
//System.out.println(" write target");
|
||||||
bytes.writeInt(target.node);
|
bytes.writeVInt(target.node);
|
||||||
}
|
}
|
||||||
|
|
||||||
// just write the arcs "like normal" on first pass,
|
// just write the arcs "like normal" on first pass,
|
||||||
|
@ -800,12 +820,10 @@ public final class FST<T> {
|
||||||
}
|
}
|
||||||
if (arc.flag(BIT_STOP_NODE)) {
|
if (arc.flag(BIT_STOP_NODE)) {
|
||||||
} else if (arc.flag(BIT_TARGET_NEXT)) {
|
} else if (arc.flag(BIT_TARGET_NEXT)) {
|
||||||
|
} else if (packed) {
|
||||||
|
in.readVInt();
|
||||||
} else {
|
} else {
|
||||||
if (packed) {
|
readUnpackedNodeTarget(in);
|
||||||
in.readVInt();
|
|
||||||
} else {
|
|
||||||
in.skipBytes(4);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
arc.flags = in.readByte();
|
arc.flags = in.readByte();
|
||||||
}
|
}
|
||||||
|
@ -819,6 +837,16 @@ public final class FST<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reads one arc target from a non-packed FST at the reader's current position.
 * The on-disk width depends on this FST's {@code version}: versions before
 * {@code VERSION_VINT_TARGET} read a fixed-width int, while that version and
 * later read a vInt.
 *
 * @param in reader positioned at the encoded target
 * @return the target value that was read
 * @throws IOException declared for the underlying reads
 */
private int readUnpackedNodeTarget(BytesReader in) throws IOException {
|
||||||
|
int target;
|
||||||
|
// Pick the decoding that matches the format version this FST was created or loaded with:
if (version < VERSION_VINT_TARGET) {
|
||||||
|
target = in.readInt();
|
||||||
|
} else {
|
||||||
|
target = in.readVInt();
|
||||||
|
}
|
||||||
|
return target;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Follow the <code>follow</code> arc and read the first arc of its target;
|
* Follow the <code>follow</code> arc and read the first arc of its target;
|
||||||
* this changes the provided <code>arc</code> (2nd arg) in-place and returns
|
* this changes the provided <code>arc</code> (2nd arg) in-place and returns
|
||||||
|
@ -920,8 +948,10 @@ public final class FST<T> {
|
||||||
|
|
||||||
final byte b = in.readByte();
|
final byte b = in.readByte();
|
||||||
if (b == ARCS_AS_FIXED_ARRAY) {
|
if (b == ARCS_AS_FIXED_ARRAY) {
|
||||||
//System.out.println(" nextArc fake array");
|
//System.out.println(" nextArc fixed array");
|
||||||
in.readVInt();
|
in.readVInt();
|
||||||
|
|
||||||
|
// Skip bytesPerArc:
|
||||||
if (packed) {
|
if (packed) {
|
||||||
in.readVInt();
|
in.readVInt();
|
||||||
} else {
|
} else {
|
||||||
|
@ -1024,7 +1054,7 @@ public final class FST<T> {
|
||||||
//System.out.println(" abs code=" + code);
|
//System.out.println(" abs code=" + code);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
arc.target = in.readInt();
|
arc.target = readUnpackedNodeTarget(in);
|
||||||
}
|
}
|
||||||
arc.nextArc = in.getPosition();
|
arc.nextArc = in.getPosition();
|
||||||
}
|
}
|
||||||
|
@ -1147,7 +1177,7 @@ public final class FST<T> {
|
||||||
if (packed) {
|
if (packed) {
|
||||||
in.readVInt();
|
in.readVInt();
|
||||||
} else {
|
} else {
|
||||||
in.readInt();
|
readUnpackedNodeTarget(in);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1354,6 +1384,7 @@ public final class FST<T> {
|
||||||
|
|
||||||
// Creates a packed FST
|
// Creates a packed FST
|
||||||
private FST(INPUT_TYPE inputType, Outputs<T> outputs) {
|
private FST(INPUT_TYPE inputType, Outputs<T> outputs) {
|
||||||
|
version = VERSION_CURRENT;
|
||||||
packed = true;
|
packed = true;
|
||||||
this.inputType = inputType;
|
this.inputType = inputType;
|
||||||
// 32 KB blocks:
|
// 32 KB blocks:
|
||||||
|
|
|
@ -58,7 +58,6 @@ import org.apache.lucene.search.ScoreDoc;
|
||||||
import org.apache.lucene.search.TermQuery;
|
import org.apache.lucene.search.TermQuery;
|
||||||
import org.apache.lucene.store.BaseDirectoryWrapper;
|
import org.apache.lucene.store.BaseDirectoryWrapper;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.MockDirectoryWrapper;
|
|
||||||
import org.apache.lucene.store.RAMDirectory;
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
import org.apache.lucene.util.Bits;
|
import org.apache.lucene.util.Bits;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -100,7 +99,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
||||||
createIndex("index.nocfs", false, false);
|
createIndex("index.nocfs", false, false);
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
// These are only needed for the special upgrade test to verify
|
// These are only needed for the special upgrade test to verify
|
||||||
// that also single-segment indexes are correctly upgraded by IndexUpgrader.
|
// that also single-segment indexes are correctly upgraded by IndexUpgrader.
|
||||||
|
@ -116,8 +115,40 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
public void testCreateMoreTermsIndex() throws Exception {
|
||||||
|
// we use a real directory name that is not cleaned up,
|
||||||
|
// because this method is only used to create backwards
|
||||||
|
// indexes:
|
||||||
|
File indexDir = new File("moreterms");
|
||||||
|
_TestUtil.rmDir(indexDir);
|
||||||
|
Directory dir = newFSDirectory(indexDir);
|
||||||
|
|
||||||
|
LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
|
||||||
|
mp.setUseCompoundFile(false);
|
||||||
|
mp.setNoCFSRatio(1.0);
|
||||||
|
mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
|
||||||
|
// TODO: remove randomness
|
||||||
|
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
|
||||||
|
.setMergePolicy(mp);
|
||||||
|
conf.setCodec(Codec.forName("Lucene40"));
|
||||||
|
IndexWriter writer = new IndexWriter(dir, conf);
|
||||||
|
LineFileDocs docs = new LineFileDocs(null, true);
|
||||||
|
for(int i=0;i<50;i++) {
|
||||||
|
writer.addDocument(docs.nextDoc());
|
||||||
|
}
|
||||||
|
writer.close();
|
||||||
|
dir.close();
|
||||||
|
|
||||||
|
// Gives you time to copy the index out!: (there is also
|
||||||
|
// a test option to not remove temp dir...):
|
||||||
|
Thread.sleep(100000);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
final static String[] oldNames = {"40.cfs",
|
final static String[] oldNames = {"40.cfs",
|
||||||
"40.nocfs",
|
"40.nocfs",
|
||||||
};
|
};
|
||||||
|
|
||||||
final String[] unsupportedNames = {"19.cfs",
|
final String[] unsupportedNames = {"19.cfs",
|
||||||
|
@ -145,7 +176,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
||||||
};
|
};
|
||||||
|
|
||||||
final static String[] oldSingleSegmentNames = {"40.optimized.cfs",
|
final static String[] oldSingleSegmentNames = {"40.optimized.cfs",
|
||||||
"40.optimized.nocfs",
|
"40.optimized.nocfs",
|
||||||
};
|
};
|
||||||
|
|
||||||
static Map<String,Directory> oldIndexDirs;
|
static Map<String,Directory> oldIndexDirs;
|
||||||
|
@ -908,4 +939,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static final String moreTermsIndex = "moreterms.40.zip";
|
||||||
|
|
||||||
|
/**
 * Unzips the index archive named by {@code moreTermsIndex} into a temp
 * directory, opens that directory, runs {@code _TestUtil.checkIndex} on it,
 * and then closes it.
 */
public void testMoreTerms() throws Exception {
|
||||||
|
File oldIndexDir = _TestUtil.getTempDir("moreterms");
|
||||||
|
_TestUtil.unzip(getDataFile(moreTermsIndex), oldIndexDir);
|
||||||
|
Directory dir = newFSDirectory(oldIndexDir);
|
||||||
|
// TODO: more tests
|
||||||
|
_TestUtil.checkIndex(dir);
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
|
@ -289,6 +289,13 @@ public class FSTTester<T> {
|
||||||
null,
|
null,
|
||||||
willRewrite,
|
willRewrite,
|
||||||
true);
|
true);
|
||||||
|
if (LuceneTestCase.VERBOSE) {
|
||||||
|
if (willRewrite) {
|
||||||
|
System.out.println("TEST: packed FST");
|
||||||
|
} else {
|
||||||
|
System.out.println("TEST: non-packed FST");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for(InputOutput<T> pair : pairs) {
|
for(InputOutput<T> pair : pairs) {
|
||||||
if (pair.output instanceof List) {
|
if (pair.output instanceof List) {
|
||||||
|
|
Loading…
Reference in New Issue