From 4104901e64e832a96b4709bf6be0b2da1d1c8b6d Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 13 Jun 2012 13:12:44 +0000 Subject: [PATCH] LUCENE-4120: FST.pack: Use packed integer arrays for improved memory efficiency. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1349826 13f79535-47bb-0310-9956-ffa450edef68 --- .../ja/util/TokenInfoDictionaryBuilder.java | 3 +- .../codecs/memory/MemoryPostingsFormat.java | 17 ++- .../org/apache/lucene/index/DocValues.java | 6 + .../org/apache/lucene/util/fst/Builder.java | 23 +++- .../java/org/apache/lucene/util/fst/FST.java | 116 +++++++++--------- .../DirectPacked64SingleBlockReader.java | 5 + .../util/packed/DirectPackedReader.java | 5 + .../lucene/util/packed/GrowableWriter.java | 14 +++ .../util/packed/Packed64SingleBlock.java | 5 + .../apache/lucene/util/packed/PackedInts.java | 55 +++++++-- .../lucene/index/TestRollingUpdates.java | 2 +- .../org/apache/lucene/util/fst/TestFSTs.java | 9 +- .../lucene/util/packed/TestPackedInts.java | 36 ++++++ .../org/apache/lucene/index/RandomCodec.java | 4 +- 14 files changed, 217 insertions(+), 83 deletions(-) diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java index 43800acb5c9..b39044ff2ac 100644 --- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java @@ -37,6 +37,7 @@ import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.packed.PackedInts; import com.ibm.icu.text.Normalizer2; @@ -161,7 +162,7 @@ public class TokenInfoDictionaryBuilder { offset = next; } - final FST fst = fstBuilder.finish().pack(2, 100000); + final FST fst = fstBuilder.finish().pack(2, 100000, PackedInts.DEFAULT); System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... "); dictionary.setFST(fst); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java index ae9ded93096..91c2aa57645 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java @@ -54,6 +54,7 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.PackedInts; // TODO: would be nice to somehow allow this to act like // InstantiatedIndex, by never writing to disk; ie you write @@ -81,14 +82,16 @@ import org.apache.lucene.util.fst.Util; public class MemoryPostingsFormat extends PostingsFormat { private final boolean doPackFST; + private final float acceptableOverheadRatio; public MemoryPostingsFormat() { - this(false); + this(false, PackedInts.DEFAULT); } - public MemoryPostingsFormat(boolean doPackFST) { + public MemoryPostingsFormat(boolean doPackFST, float acceptableOverheadRatio) { super("Memory"); this.doPackFST = doPackFST; + this.acceptableOverheadRatio = acceptableOverheadRatio; } @Override @@ -102,13 +105,15 @@ public class MemoryPostingsFormat extends PostingsFormat { private final Builder builder; private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); private final boolean doPackFST; + private final float acceptableOverheadRatio; private int termCount; - public TermsWriter(IndexOutput out, FieldInfo field, boolean doPackFST) { + public TermsWriter(IndexOutput out, FieldInfo field, boolean doPackFST, float acceptableOverheadRatio) { this.out = out; this.field = field; this.doPackFST = doPackFST; - builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doPackFST); + this.acceptableOverheadRatio = acceptableOverheadRatio; + builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doPackFST, acceptableOverheadRatio); } private class PostingsWriter extends PostingsConsumer { @@ -265,7 +270,7 @@ public class MemoryPostingsFormat extends PostingsFormat { out.writeVInt(docCount); FST fst = builder.finish(); if (doPackFST) { - fst = fst.pack(3, Math.max(10, fst.getNodeCount()/4)); + fst = fst.pack(3, Math.max(10, fst.getNodeCount()/4), acceptableOverheadRatio); } fst.save(out); //System.out.println("finish field=" + field.name + " fp=" + out.getFilePointer()); @@ -290,7 +295,7 @@ public class MemoryPostingsFormat extends PostingsFormat { @Override public TermsConsumer addField(FieldInfo field) { //System.out.println("\naddField field=" + field.name); - return new TermsWriter(out, field, doPackFST); + return new TermsWriter(out, field, doPackFST, acceptableOverheadRatio); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValues.java b/lucene/core/src/java/org/apache/lucene/index/DocValues.java index cb9b3c0a789..3ff6b97f061 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValues.java @@ -32,6 +32,7 @@ import org.apache.lucene.document.PackedLongDocValuesField; // javadocs import org.apache.lucene.document.ShortDocValuesField; // javadocs import org.apache.lucene.document.SortedBytesDocValuesField; // javadocs import org.apache.lucene.document.StraightBytesDocValuesField; // javadocs +import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.packed.PackedInts; @@ -411,6 +412,11 @@ public abstract class DocValues implements Closeable { Arrays.fill(arr, off, off+len, 0); return len; } + + @Override + public long ramBytesUsed() { + return 0; + } }; return new SortedSource(type, BytesRef.getUTF8SortedAsUnicodeComparator()) { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java index 2d8d9d83704..ea9fd45b358 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java @@ -23,6 +23,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc +import org.apache.lucene.util.packed.PackedInts; /** * Builds a minimal FST (maps an IntsRef term to an arbitrary @@ -83,7 +84,18 @@ public class Builder { * pruning options turned off. */ public Builder(FST.INPUT_TYPE inputType, Outputs outputs) { - this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false); + this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false, PackedInts.COMPACT); + } + + /** + * Instantiates an FST/FSA builder with {@link PackedInts#DEFAULT} + * acceptableOverheadRatio. + */ + public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, + boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, + FreezeTail freezeTail, boolean willPackFST) { + this(inputType, minSuffixCount1, minSuffixCount2, doShareSuffix, doShareNonSingletonNodes, + shareMaxTailLength, outputs, freezeTail, willPackFST, PackedInts.DEFAULT); } /** @@ -126,17 +138,20 @@ public class Builder { * @param willPackFST Pass true if you will pack the FST before saving. This * causes the FST to create additional data structures internally to facilitate packing, but * it means the resulting FST cannot be saved: it must - * first be packed using {@link FST#pack(int, int)}}. + * first be packed using {@link FST#pack(int, int, float)} + * + * @param acceptableOverheadRatio How to trade speed for space when building the FST. This option + * is only relevant when willPackFST is true. @see PackedInts#getMutable(int, int, float) */ public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, - FreezeTail freezeTail, boolean willPackFST) { + FreezeTail freezeTail, boolean willPackFST, float acceptableOverheadRatio) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; this.freezeTail = freezeTail; this.doShareNonSingletonNodes = doShareNonSingletonNodes; this.shareMaxTailLength = shareMaxTailLength; - fst = new FST(inputType, outputs, willPackFST); + fst = new FST(inputType, outputs, willPackFST, acceptableOverheadRatio); if (doShareSuffix) { dedupHash = new NodeHash(fst); } else { diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 9a46f258726..f69cbccb6ef 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -37,8 +37,9 @@ import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.PriorityQueue; -import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.fst.Builder.UnCompiledNode; +import org.apache.lucene.util.packed.GrowableWriter; +import org.apache.lucene.util.packed.PackedInts; // TODO: break this into WritableFST and ReadOnlyFST.. then // we can have subclasses of ReadOnlyFST to handle the @@ -155,7 +156,7 @@ public final class FST { public int arcWithOutputCount; private final boolean packed; - private final int[] nodeRefToAddress; + private PackedInts.Reader nodeRefToAddress; // If arc has this label then that arc is final/accepted public static final int END_LABEL = -1; @@ -252,25 +253,23 @@ public final class FST { private final BytesWriter writer; - // TODO: we can save RAM here by using growable packed - // ints...: - private int[] nodeAddress; + private GrowableWriter nodeAddress; // TODO: we could be smarter here, and prune periodically // as we go; high in-count nodes will "usually" become // clear early on: - private int[] inCounts; + private GrowableWriter inCounts; // make a new empty FST, for building; Builder invokes // this ctor - FST(INPUT_TYPE inputType, Outputs outputs, boolean willPackFST) { + FST(INPUT_TYPE inputType, Outputs outputs, boolean willPackFST, float acceptableOverheadRatio) { this.inputType = inputType; this.outputs = outputs; bytes = new byte[128]; NO_OUTPUT = outputs.getNoOutput(); if (willPackFST) { - nodeAddress = new int[8]; - inCounts = new int[8]; + nodeAddress = new GrowableWriter(PackedInts.bitsRequired(bytes.length - 1), 8, acceptableOverheadRatio); + inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio); } else { nodeAddress = null; inCounts = null; @@ -320,11 +319,7 @@ public final class FST { throw new IllegalStateException("invalid input type " + t); } if (packed) { - final int nodeRefCount = in.readVInt(); - nodeRefToAddress = new int[nodeRefCount]; - for(int idx=0;idx { public int sizeInBytes() { int size = bytes.length; if (packed) { - size += nodeRefToAddress.length * RamUsageEstimator.NUM_BYTES_INT; + size += nodeRefToAddress.ramBytesUsed(); } else if (nodeAddress != null) { - size += nodeAddress.length * RamUsageEstimator.NUM_BYTES_INT; - size += inCounts.length * RamUsageEstimator.NUM_BYTES_INT; + size += nodeAddress.ramBytesUsed(); + size += inCounts.ramBytesUsed(); } return size; } @@ -374,7 +369,7 @@ public final class FST { private int getNodeAddress(int node) { if (nodeAddress != null) { // Deref - return nodeAddress[node]; + return (int) nodeAddress.get(node); } else { // Straight return node; @@ -444,6 +439,9 @@ public final class FST { if (nodeAddress != null) { throw new IllegalStateException("cannot save an FST pre-packed FST; it must first be packed"); } + if (packed && !(nodeRefToAddress instanceof PackedInts.Mutable)) { + throw new IllegalStateException("cannot save a FST which has been loaded from disk "); + } CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); if (packed) { out.writeByte((byte) 1); @@ -469,11 +467,7 @@ public final class FST { } out.writeByte(t); if (packed) { - assert nodeRefToAddress != null; - out.writeVInt(nodeRefToAddress.length); - for(int idx=0;idx { if (!targetHasArcs) { flags += BIT_STOP_NODE; } else if (inCounts != null) { - inCounts[target.node]++; + inCounts.set(target.node, inCounts.get(target.node) + 1); } if (arc.output != NO_OUTPUT) { @@ -715,11 +709,11 @@ public final class FST { final int node; if (nodeAddress != null) { // Nodes are addressed by 1+ord: - if (nodeCount == nodeAddress.length) { - nodeAddress = ArrayUtil.grow(nodeAddress); - inCounts = ArrayUtil.grow(inCounts); + if (nodeCount == nodeAddress.size()) { + nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue())); + inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue())); } - nodeAddress[nodeCount] = endAddress; + nodeAddress.set(nodeCount, endAddress); // System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress); node = nodeCount; } else { @@ -1005,9 +999,9 @@ public final class FST { // Address is delta-coded from current address: arc.target = pos + code; //System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target); - } else if (code < nodeRefToAddress.length) { + } else if (code < nodeRefToAddress.size()) { // Deref - arc.target = nodeRefToAddress[code]; + arc.target = (int) nodeRefToAddress.get(code); //System.out.println(" deref code=" + code + " target=" + arc.target); } else { // Absolute @@ -1420,7 +1414,7 @@ public final class FST { */ // Creates a packed FST - private FST(INPUT_TYPE inputType, int[] nodeRefToAddress, Outputs outputs) { + private FST(INPUT_TYPE inputType, PackedInts.Reader nodeRefToAddress, Outputs outputs) { packed = true; this.inputType = inputType; bytes = new byte[128]; @@ -1432,8 +1426,10 @@ public final class FST { /** Expert: creates an FST by packing this one. This * process requires substantial additional RAM (currently - * ~8 bytes per node), but then should produce a smaller FST. */ - public FST pack(int minInCountDeref, int maxDerefNodes) throws IOException { + * up to ~8 bytes per node depending on + * acceptableOverheadRatio), but then should + * produce a smaller FST. */ + public FST pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException { // TODO: other things to try // - renumber the nodes to get more next / better locality? @@ -1454,22 +1450,22 @@ public final class FST { final BytesReader r = getBytesReader(0); - final int topN = Math.min(maxDerefNodes, inCounts.length); + final int topN = Math.min(maxDerefNodes, inCounts.size()); // Find top nodes with highest number of incoming arcs: NodeQueue q = new NodeQueue(topN); // TODO: we could use more RAM efficient selection algo here... NodeAndInCount bottom = null; - for(int node=0;node= minInCountDeref) { + for(int node=0; node= minInCountDeref) { if (bottom == null) { - q.add(new NodeAndInCount(node, inCounts[node])); + q.add(new NodeAndInCount(node, (int) inCounts.get(node))); if (q.size() == topN) { bottom = q.top(); } - } else if (inCounts[node] > bottom.count) { - q.insertWithOverflow(new NodeAndInCount(node, inCounts[node])); + } else if (inCounts.get(node) > bottom.count) { + q.insertWithOverflow(new NodeAndInCount(node, (int) inCounts.get(node))); } } } @@ -1484,20 +1480,17 @@ public final class FST { //System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo); } - // TODO: we can use packed ints: - // +1 because node ords start at 1 (0 is reserved as - // stop node): - final int[] nodeRefToAddressIn = new int[topNodeMap.size()]; - - final FST fst = new FST(inputType, nodeRefToAddressIn, outputs); + final FST fst = new FST(inputType, null, outputs); final BytesWriter writer = fst.writer; - - final int[] newNodeAddress = new int[1+nodeCount]; + + // +1 because node ords start at 1 (0 is reserved as stop node): + final GrowableWriter newNodeAddress = new GrowableWriter( + PackedInts.bitsRequired(bytes.length), 1 + nodeCount, acceptableOverheadRatio); // Fill initial coarse guess: for(int node=1;node<=nodeCount;node++) { - newNodeAddress[node] = 1 + bytes.length - nodeAddress[node]; + newNodeAddress.set(node, 1 + bytes.length - nodeAddress.get(node)); } int absCount; @@ -1537,11 +1530,11 @@ public final class FST { fst.nodeCount++; final int address = writer.posWrite; //System.out.println(" node: " + node + " address=" + address); - if (address != newNodeAddress[node]) { - addressError = address - newNodeAddress[node]; + if (address != newNodeAddress.get(node)) { + addressError = address - (int) newNodeAddress.get(node); //System.out.println(" change: " + (address - newNodeAddress[node])); changed = true; - newNodeAddress[node] = address; + newNodeAddress.set(node, address); changedCount++; } @@ -1621,10 +1614,10 @@ public final class FST { if (ptr != null) { absPtr = ptr; } else { - absPtr = topNodeMap.size() + newNodeAddress[arc.target] + addressError; + absPtr = topNodeMap.size() + (int) newNodeAddress.get(arc.target) + addressError; } - int delta = newNodeAddress[arc.target] + addressError - writer.posWrite - 2; + int delta = (int) newNodeAddress.get(arc.target) + addressError - writer.posWrite - 2; if (delta < 0) { //System.out.println("neg: " + delta); anyNegDelta = true; @@ -1654,7 +1647,7 @@ public final class FST { if (doWriteTarget) { - int delta = newNodeAddress[arc.target] + addressError - writer.posWrite; + int delta = (int) newNodeAddress.get(arc.target) + addressError - writer.posWrite; if (delta < 0) { anyNegDelta = true; //System.out.println("neg: " + delta); @@ -1745,11 +1738,20 @@ public final class FST { //System.out.println(" " + changedCount + " of " + fst.nodeCount + " changed; retry"); } - for(Map.Entry ent : topNodeMap.entrySet()) { - nodeRefToAddressIn[ent.getValue()] = newNodeAddress[ent.getKey()]; + long maxAddress = 0; + for (int key : topNodeMap.keySet()) { + maxAddress = Math.max(maxAddress, newNodeAddress.get(key)); } - fst.startNode = newNodeAddress[startNode]; + PackedInts.Mutable nodeRefToAddressIn = PackedInts.getMutable(topNodeMap.size(), + PackedInts.bitsRequired(maxAddress), acceptableOverheadRatio); + for(Map.Entry ent : topNodeMap.entrySet()) { + nodeRefToAddressIn.set(ent.getValue(), newNodeAddress.get(ent.getKey())); + } + fst.nodeRefToAddress = nodeRefToAddressIn; + + + fst.startNode = (int) newNodeAddress.get(startNode); //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode); if (emptyOutput != null) { diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/DirectPacked64SingleBlockReader.java b/lucene/core/src/java/org/apache/lucene/util/packed/DirectPacked64SingleBlockReader.java index 49744cec1d1..ce935925387 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/DirectPacked64SingleBlockReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/DirectPacked64SingleBlockReader.java @@ -51,4 +51,9 @@ final class DirectPacked64SingleBlockReader extends PackedInts.ReaderImpl { throw new IllegalStateException("failed", e); } } + + @Override + public long ramBytesUsed() { + return 0; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/DirectPackedReader.java b/lucene/core/src/java/org/apache/lucene/util/packed/DirectPackedReader.java index 909acc9cdd6..309849fa4c8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/DirectPackedReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/DirectPackedReader.java @@ -73,4 +73,9 @@ final class DirectPackedReader extends PackedInts.ReaderImpl { throw new IllegalStateException("failed", ioe); } } + + @Override + public long ramBytesUsed() { + return 0; + } } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/GrowableWriter.java b/lucene/core/src/java/org/apache/lucene/util/packed/GrowableWriter.java index 83a76efbac5..d87d434288a 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/GrowableWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/GrowableWriter.java @@ -17,6 +17,10 @@ package org.apache.lucene.util.packed; * limitations under the License. */ +import java.io.IOException; + +import org.apache.lucene.store.DataOutput; + /** * Implements {@link PackedInts.Mutable}, but grows the * bit count of the underlying packed ints on-demand. @@ -111,4 +115,14 @@ public class GrowableWriter implements PackedInts.Mutable { current.fill(fromIndex, toIndex, val); } + @Override + public long ramBytesUsed() { + return current.ramBytesUsed(); + } + + @Override + public void save(DataOutput out) throws IOException { + current.save(out); + } + } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java b/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java index b0e50aedb50..e89ad5e7759 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/Packed64SingleBlock.java @@ -284,6 +284,11 @@ abstract class Packed64SingleBlock extends PackedInts.MutableImpl { return RamUsageEstimator.sizeOf(blocks); } + @Override + protected int getFormat() { + return PackedInts.PACKED_SINGLE_BLOCK; + } + @Override public String toString() { return getClass().getSimpleName() + "(bitsPerValue=" + bitsPerValue diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java b/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java index 4c6572c4bf4..583ba43809c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java @@ -100,6 +100,11 @@ public class PackedInts { */ int size(); + /** + * Return the in-memory size in bytes. + */ + long ramBytesUsed(); + /** * Expert: if the bit-width of this reader matches one of * java's native types, returns the underlying array @@ -118,6 +123,7 @@ public class PackedInts { * @see #getArray */ boolean hasArray(); + } /** @@ -171,6 +177,7 @@ public class PackedInts { * @lucene.internal */ public static interface Mutable extends Reader { + /** * Set the value at the given index in the array. * @param index where the value should be positioned. @@ -197,6 +204,13 @@ public class PackedInts { */ void clear(); + /** + * Save this mutable into out. Instantiating a reader from + * the generated data will return a reader with the same number of bits + * per value. + */ + void save(DataOutput out) throws IOException; + } /** @@ -239,6 +253,7 @@ public class PackedInts { } return gets; } + } public static abstract class MutableImpl extends ReaderImpl implements Mutable { @@ -267,6 +282,18 @@ public class PackedInts { } } + protected int getFormat() { + return PACKED; + } + + @Override + public void save(DataOutput out) throws IOException { + Writer writer = getWriterByFormat(out, valueCount, bitsPerValue, getFormat()); + for (int i = 0; i < valueCount; ++i) { + writer.add(get(i)); + } + writer.finish(); + } } /** A write-once Writer. @@ -470,28 +497,40 @@ public class PackedInts { int maxBitsPerValue = bitsPerValue + (int) acceptableOverheadPerValue; if (bitsPerValue <= 8 && maxBitsPerValue >= 8) { - return new PackedWriter(out, valueCount, 8); + return getWriterByFormat(out, valueCount, 8, PACKED); } else if (bitsPerValue <= 16 && maxBitsPerValue >= 16) { - return new PackedWriter(out, valueCount, 16); + return getWriterByFormat(out, valueCount, 16, PACKED); } else if (bitsPerValue <= 32 && maxBitsPerValue >= 32) { - return new PackedWriter(out, valueCount, 32); + return getWriterByFormat(out, valueCount, 32, PACKED); } else if (bitsPerValue <= 64 && maxBitsPerValue >= 64) { - return new PackedWriter(out, valueCount, 64); + return getWriterByFormat(out, valueCount, 64, PACKED); } else if (valueCount <= Packed8ThreeBlocks.MAX_SIZE && bitsPerValue <= 24 && maxBitsPerValue >= 24) { - return new PackedWriter(out, valueCount, 24); + return getWriterByFormat(out, valueCount, 24, PACKED); } else if (valueCount <= Packed16ThreeBlocks.MAX_SIZE && bitsPerValue <= 48 && maxBitsPerValue >= 48) { - return new PackedWriter(out, valueCount, bitsPerValue); + return getWriterByFormat(out, valueCount, 48, PACKED); } else { for (int bpv = bitsPerValue; bpv <= maxBitsPerValue; ++bpv) { if (Packed64SingleBlock.isSupported(bpv)) { float overhead = Packed64SingleBlock.overheadPerValue(bpv); float acceptableOverhead = acceptableOverheadPerValue + bitsPerValue - bpv; if (overhead <= acceptableOverhead) { - return new Packed64SingleBlockWriter(out, valueCount, bpv); + return getWriterByFormat(out, valueCount, bpv, PACKED_SINGLE_BLOCK); } } } - return new PackedWriter(out, valueCount, bitsPerValue); + return getWriterByFormat(out, valueCount, bitsPerValue, PACKED); + } + } + + private static Writer getWriterByFormat(DataOutput out, + int valueCount, int bitsPerValue, int format) throws IOException { + switch (format) { + case PACKED: + return new PackedWriter(out, valueCount, bitsPerValue); + case PACKED_SINGLE_BLOCK: + return new Packed64SingleBlockWriter(out, valueCount, bitsPerValue); + default: + throw new IllegalArgumentException("Unknown format " + format); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java b/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java index a3b5a685a3e..cd131a14c2c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestRollingUpdates.java @@ -41,7 +41,7 @@ public class TestRollingUpdates extends LuceneTestCase { //provider.register(new MemoryCodec()); if (random().nextBoolean()) { - Codec.setDefault(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat(random().nextBoolean()))); + Codec.setDefault(_TestUtil.alwaysPostingsFormat(new MemoryPostingsFormat(random().nextBoolean(), random.nextFloat()))); } final IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index c0e52ee76e5..c526e84786e 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -64,6 +64,7 @@ import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.apache.lucene.util.packed.PackedInts; @SuppressCodecs({ "SimpleText", "Memory" }) public class TestFSTs extends LuceneTestCase { @@ -536,7 +537,7 @@ public class TestFSTs extends LuceneTestCase { if (VERBOSE) { System.out.println("TEST: now rewrite"); } - final FST packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000)); + final FST packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat()); if (VERBOSE) { System.out.println("TEST: now verify packed FST"); } @@ -1182,7 +1183,7 @@ public class TestFSTs extends LuceneTestCase { if (rewriteIter == 1) { if (doRewrite) { // Verify again, with packed FST: - fst = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000)); + fst = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000), random.nextFloat()); } else { break; } @@ -1324,7 +1325,7 @@ public class TestFSTs extends LuceneTestCase { if (doPack) { System.out.println("Pack..."); - fst = fst.pack(4, 100000000); + fst = fst.pack(4, 100000000, random().nextFloat()); System.out.println("New size " + fst.sizeInBytes() + " bytes"); } @@ -1927,7 +1928,7 @@ public class TestFSTs extends LuceneTestCase { final Long nothing = outputs.getNoOutput(); final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs, false); + final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInts.COMPACT); final Builder.UnCompiledNode rootNode = new Builder.UnCompiledNode(b, 0); diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java index c4ff31ddf5e..1d9189d59f8 100644 --- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java +++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java @@ -560,4 +560,40 @@ public class TestPackedInts extends LuceneTestCase { assertEquals(1 << 10, wrt.get(valueCount - 1)); } + public void testSave() throws IOException { + final int valueCount = _TestUtil.nextInt(random(), 1, 2048); + for (int bpv = 1; bpv <= 64; ++bpv) { + final int maxValue = (int) Math.min(PackedInts.maxValue(31), PackedInts.maxValue(bpv)); + final RAMDirectory directory = new RAMDirectory(); + List packedInts = createPackedInts(valueCount, bpv); + for (PackedInts.Mutable mutable : packedInts) { + for (int i = 0; i < mutable.size(); ++i) { + mutable.set(i, random().nextInt(maxValue)); + } + + IndexOutput out = directory.createOutput("packed-ints.bin", IOContext.DEFAULT); + mutable.save(out); + out.close(); + + IndexInput in = directory.openInput("packed-ints.bin", IOContext.DEFAULT); + PackedInts.Reader reader = PackedInts.getReader(in); + assertEquals(mutable.getBitsPerValue(), reader.getBitsPerValue()); + assertEquals(valueCount, reader.size()); + if (mutable instanceof Packed64SingleBlock) { + // make sure that we used the right format so that the reader has + // the same performance characteristics as the mutable that has been + // serialized + assertTrue(reader instanceof Packed64SingleBlock); + } else { + assertFalse(reader instanceof Packed64SingleBlock); + } + for (int i = 0; i < valueCount; ++i) { + assertEquals(mutable.get(i), reader.get(i)); + } + in.close(); + directory.deleteFile("packed-ints.bin"); + } + } + } + } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index 526b3ac6fdb..d9c91d3bab9 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -99,8 +99,8 @@ public class RandomCodec extends Lucene40Codec { new NestedPulsingPostingsFormat(), new Lucene40WithOrds(), new SimpleTextPostingsFormat(), - new MemoryPostingsFormat(true), - new MemoryPostingsFormat(false)); + new MemoryPostingsFormat(true, random.nextFloat()), + new MemoryPostingsFormat(false, random.nextFloat())); Collections.shuffle(formats, random); }