mirror of https://github.com/apache/lucene.git
LUCENE-4677: use vInt not int to encode arc's target address in un-packed FSTs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1432466 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2220c6e8ba
commit
d578775e8f
|
@ -19,6 +19,9 @@ Changes in backwards compatibility policy
|
|||
(Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless,
|
||||
Robert Muir)
|
||||
|
||||
* LUCENE-4677: unpacked FSTs now use vInt to encode the node target,
|
||||
to reduce their size (Mike McCandless)
|
||||
|
||||
* LUCENE-4678: FST now uses a paged byte[] structure instead of a
|
||||
single byte[] internally, to avoid large memory spikes during
|
||||
building (James Dyer, Mike McCandless)
|
||||
|
|
|
@ -27,6 +27,11 @@ import java.io.InputStream;
|
|||
import java.io.OutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
/*
|
||||
import java.io.Writer;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.FileOutputStream;
|
||||
*/
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
@ -124,7 +129,10 @@ public final class FST<T> {
|
|||
/** Added optional packed format. */
|
||||
private final static int VERSION_PACKED = 3;
|
||||
|
||||
private final static int VERSION_CURRENT = VERSION_PACKED;
|
||||
/** Changed from int to vInt for encoding arc targets. */
|
||||
private final static int VERSION_VINT_TARGET = 4;
|
||||
|
||||
private final static int VERSION_CURRENT = VERSION_VINT_TARGET;
|
||||
|
||||
// Never serialized; just used to represent the virtual
|
||||
// final node w/ no arcs:
|
||||
|
@ -259,12 +267,15 @@ public final class FST<T> {
|
|||
// clear early on:
|
||||
private GrowableWriter inCounts;
|
||||
|
||||
private final int version;
|
||||
|
||||
// make a new empty FST, for building; Builder invokes
|
||||
// this ctor
|
||||
FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, boolean allowArrayArcs) {
|
||||
this.inputType = inputType;
|
||||
this.outputs = outputs;
|
||||
this.allowArrayArcs = allowArrayArcs;
|
||||
version = VERSION_CURRENT;
|
||||
// 32 KB blocks:
|
||||
bytes = new BytesStore(15);
|
||||
// pad: ensure no node gets address 0 which is reserved to mean
|
||||
|
@ -289,7 +300,7 @@ public final class FST<T> {
|
|||
this.outputs = outputs;
|
||||
// NOTE: reads formats VERSION_PACKED through VERSION_VINT_TARGET;
|
||||
// older formats are rejected, since FSTs are experimental and carry no back-compat promise:
|
||||
CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED);
|
||||
version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET);
|
||||
packed = in.readByte() == 1;
|
||||
if (in.readByte() == 1) {
|
||||
// accepts empty string
|
||||
|
@ -350,6 +361,15 @@ public final class FST<T> {
|
|||
// building; we need to break out mutable FST from
|
||||
// immutable
|
||||
allowArrayArcs = false;
|
||||
|
||||
/*
|
||||
if (bytes.length == 665) {
|
||||
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
|
||||
Util.toDot(this, w, false, false);
|
||||
w.close();
|
||||
System.out.println("Wrote FST to out.dot");
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
public INPUT_TYPE getInputType() {
|
||||
|
@ -661,7 +681,7 @@ public final class FST<T> {
|
|||
if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) {
|
||||
assert target.node > 0;
|
||||
//System.out.println(" write target");
|
||||
bytes.writeInt(target.node);
|
||||
bytes.writeVInt(target.node);
|
||||
}
|
||||
|
||||
// just write the arcs "like normal" on first pass,
|
||||
|
@ -800,12 +820,10 @@ public final class FST<T> {
|
|||
}
|
||||
if (arc.flag(BIT_STOP_NODE)) {
|
||||
} else if (arc.flag(BIT_TARGET_NEXT)) {
|
||||
} else if (packed) {
|
||||
in.readVInt();
|
||||
} else {
|
||||
if (packed) {
|
||||
in.readVInt();
|
||||
} else {
|
||||
in.skipBytes(4);
|
||||
}
|
||||
readUnpackedNodeTarget(in);
|
||||
}
|
||||
arc.flags = in.readByte();
|
||||
}
|
||||
|
@ -819,6 +837,16 @@ public final class FST<T> {
|
|||
}
|
||||
}
|
||||
|
||||
private int readUnpackedNodeTarget(BytesReader in) throws IOException {
|
||||
int target;
|
||||
if (version < VERSION_VINT_TARGET) {
|
||||
target = in.readInt();
|
||||
} else {
|
||||
target = in.readVInt();
|
||||
}
|
||||
return target;
|
||||
}
|
||||
|
||||
/**
|
||||
* Follow the <code>follow</code> arc and read the first arc of its target;
|
||||
* this changes the provided <code>arc</code> (2nd arg) in-place and returns
|
||||
|
@ -920,8 +948,10 @@ public final class FST<T> {
|
|||
|
||||
final byte b = in.readByte();
|
||||
if (b == ARCS_AS_FIXED_ARRAY) {
|
||||
//System.out.println(" nextArc fake array");
|
||||
//System.out.println(" nextArc fixed array");
|
||||
in.readVInt();
|
||||
|
||||
// Skip bytesPerArc:
|
||||
if (packed) {
|
||||
in.readVInt();
|
||||
} else {
|
||||
|
@ -1024,7 +1054,7 @@ public final class FST<T> {
|
|||
//System.out.println(" abs code=" + code);
|
||||
}
|
||||
} else {
|
||||
arc.target = in.readInt();
|
||||
arc.target = readUnpackedNodeTarget(in);
|
||||
}
|
||||
arc.nextArc = in.getPosition();
|
||||
}
|
||||
|
@ -1147,7 +1177,7 @@ public final class FST<T> {
|
|||
if (packed) {
|
||||
in.readVInt();
|
||||
} else {
|
||||
in.readInt();
|
||||
readUnpackedNodeTarget(in);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1354,6 +1384,7 @@ public final class FST<T> {
|
|||
|
||||
// Creates a packed FST
|
||||
private FST(INPUT_TYPE inputType, Outputs<T> outputs) {
|
||||
version = VERSION_CURRENT;
|
||||
packed = true;
|
||||
this.inputType = inputType;
|
||||
// 32 KB blocks:
|
||||
|
|
|
@ -58,7 +58,6 @@ import org.apache.lucene.search.ScoreDoc;
|
|||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.store.BaseDirectoryWrapper;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.MockDirectoryWrapper;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -100,7 +99,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
createIndex("index.nocfs", false, false);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
// These are only needed for the special upgrade test to verify
|
||||
// that also single-segment indexes are correctly upgraded by IndexUpgrader.
|
||||
|
@ -116,8 +115,40 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
}
|
||||
|
||||
*/
|
||||
|
||||
/*
|
||||
public void testCreateMoreTermsIndex() throws Exception {
|
||||
// we use a real directory name that is not cleaned up,
|
||||
// because this method is only used to create backwards
|
||||
// indexes:
|
||||
File indexDir = new File("moreterms");
|
||||
_TestUtil.rmDir(indexDir);
|
||||
Directory dir = newFSDirectory(indexDir);
|
||||
|
||||
LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
|
||||
mp.setUseCompoundFile(false);
|
||||
mp.setNoCFSRatio(1.0);
|
||||
mp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY);
|
||||
// TODO: remove randomness
|
||||
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
|
||||
.setMergePolicy(mp);
|
||||
conf.setCodec(Codec.forName("Lucene40"));
|
||||
IndexWriter writer = new IndexWriter(dir, conf);
|
||||
LineFileDocs docs = new LineFileDocs(null, true);
|
||||
for(int i=0;i<50;i++) {
|
||||
writer.addDocument(docs.nextDoc());
|
||||
}
|
||||
writer.close();
|
||||
dir.close();
|
||||
|
||||
// Gives you time to copy the index out!: (there is also
|
||||
// a test option to not remove temp dir...):
|
||||
Thread.sleep(100000);
|
||||
}
|
||||
*/
|
||||
|
||||
final static String[] oldNames = {"40.cfs",
|
||||
"40.nocfs",
|
||||
"40.nocfs",
|
||||
};
|
||||
|
||||
final String[] unsupportedNames = {"19.cfs",
|
||||
|
@ -145,7 +176,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
};
|
||||
|
||||
final static String[] oldSingleSegmentNames = {"40.optimized.cfs",
|
||||
"40.optimized.nocfs",
|
||||
"40.optimized.nocfs",
|
||||
};
|
||||
|
||||
static Map<String,Directory> oldIndexDirs;
|
||||
|
@ -908,4 +939,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
dir.close();
|
||||
}
|
||||
}
|
||||
|
||||
public static final String moreTermsIndex = "moreterms.40.zip";
|
||||
|
||||
public void testMoreTerms() throws Exception {
|
||||
File oldIndexDir = _TestUtil.getTempDir("moreterms");
|
||||
_TestUtil.unzip(getDataFile(moreTermsIndex), oldIndexDir);
|
||||
Directory dir = newFSDirectory(oldIndexDir);
|
||||
// TODO: more tests
|
||||
_TestUtil.checkIndex(dir);
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
|
@ -289,6 +289,13 @@ public class FSTTester<T> {
|
|||
null,
|
||||
willRewrite,
|
||||
true);
|
||||
if (LuceneTestCase.VERBOSE) {
|
||||
if (willRewrite) {
|
||||
System.out.println("TEST: packed FST");
|
||||
} else {
|
||||
System.out.println("TEST: non-packed FST");
|
||||
}
|
||||
}
|
||||
|
||||
for(InputOutput<T> pair : pairs) {
|
||||
if (pair.output instanceof List) {
|
||||
|
|
Loading…
Reference in New Issue