From 28c15b963721bdd8d7fe548b44a4f5d84547666b Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 27 Jun 2011 08:07:15 +0000 Subject: [PATCH] LUCENE-3216: keep float doc values in memory during indexing while merge directly to the target file git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1140044 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/index/values/Bytes.java | 2 +- .../apache/lucene/index/values/Floats.java | 141 ++++++++++++------ .../apache/lucene/index/values/IntsImpl.java | 2 +- .../apache/lucene/index/values/Writer.java | 6 +- 4 files changed, 101 insertions(+), 50 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/index/values/Bytes.java b/lucene/src/java/org/apache/lucene/index/values/Bytes.java index 0944fc9ee4c..d566d96653c 100644 --- a/lucene/src/java/org/apache/lucene/index/values/Bytes.java +++ b/lucene/src/java/org/apache/lucene/index/values/Bytes.java @@ -387,7 +387,7 @@ public final class Bytes { } @Override - protected void add(int docID) throws IOException { + protected void mergeDoc(int docID) throws IOException { add(docID, bytesRef); } diff --git a/lucene/src/java/org/apache/lucene/index/values/Floats.java b/lucene/src/java/org/apache/lucene/index/values/Floats.java index e4200b6ab3d..7fe03b29161 100644 --- a/lucene/src/java/org/apache/lucene/index/values/Floats.java +++ b/lucene/src/java/org/apache/lucene/index/values/Floats.java @@ -25,10 +25,12 @@ import org.apache.lucene.index.values.IndexDocValues.Source; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.FloatsRef; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; /** * Exposes {@link Writer} and reader ({@link Source}) for 32 bit and 64 bit @@ -44,11 +46,7 @@ public class Floats { private static final String CODEC_NAME = "SimpleFloats"; static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; - private static final int INT_DEFAULT = Float - .floatToRawIntBits(0.0f); - private static final long LONG_DEFAULT = Double - .doubleToRawLongBits(0.0d); - + private static final byte[] DEFAULTS = new byte[] {0,0,0,0,0,0,0,0}; public static Writer getWriter(Directory dir, String id, int precisionBytes, AtomicLong bytesUsed) throws IOException { @@ -70,16 +68,27 @@ public class Floats { abstract static class FloatsWriter extends Writer { private final String id; - private FloatsRef floatsRef; + protected FloatsRef floatsRef; protected int lastDocId = -1; protected IndexOutput datOut; private final byte precision; + private final Directory dir; protected FloatsWriter(Directory dir, String id, int precision, AtomicLong bytesUsed) throws IOException { super(bytesUsed); this.id = id; this.precision = (byte) precision; + this.dir = dir; + + } + + public long ramBytesUsed() { + return 0; + } + + final void initDataOut() throws IOException { + assert datOut == null; datOut = dir.createOutput(IndexFileNames.segmentFileName(id, "", Writer.DATA_EXTENSION)); boolean success = false; @@ -95,12 +104,8 @@ public class Floats { } } - public long ramBytesUsed() { - return 0; - } - @Override - protected void add(int docID) throws IOException { + protected void mergeDoc(int docID) throws IOException { add(docID, floatsRef.get()); } @@ -114,13 +119,20 @@ public class Floats { floatsRef = valuesEnum.getFloat(); } - protected abstract int fillDefault(int num) throws IOException; + protected final int fillDefault(int numValues) throws IOException { + for (int i = 0; i < numValues; i++) { + datOut.writeBytes(DEFAULTS, precision); + } + return numValues; + } @Override protected void merge(MergeState state) throws IOException { + if (datOut == null) { + initDataOut(); + } if (state.bits == null && state.reader instanceof FloatsReader) { // no deletes - bulk copy - // TODO: should be do bulks with deletes too? final FloatsReader reader = (FloatsReader) state.reader; assert reader.precisionBytes == (int) precision; if (reader.maxDoc == 0) @@ -131,8 +143,10 @@ public class Floats { lastDocId += fillDefault(docBase - lastDocId - 1); } lastDocId += reader.transferTo(datOut); - } else - super.merge(state); + } else { + super.merge(state); + } + } @Override @@ -143,10 +157,12 @@ public class Floats { // Writes 4 bytes (float) per value static class Float4Writer extends FloatsWriter { - + private int[] values; protected Float4Writer(Directory dir, String id, AtomicLong bytesUsed) throws IOException { super(dir, id, 4, bytesUsed); + values = new int[1]; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT); } @Override @@ -154,76 +170,111 @@ public class Floats { throws IOException { assert docID > lastDocId : "docID: " + docID + " must be greater than the last added doc id: " + lastDocId; + if (docID >= values.length) { + final long len = values.length; + values = ArrayUtil.grow(values, 1 + docID); + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT + * ((values.length) - len)); + } + values[docID] = Float.floatToRawIntBits((float)v); + lastDocId = docID; + } + + @Override + protected void mergeDoc(int docID) throws IOException { + assert datOut != null; + assert docID > lastDocId : "docID: " + docID + + " must be greater than the last added doc id: " + lastDocId; if (docID - lastDocId > 1) { // fill with default values - lastDocId += fillDefault(docID - lastDocId - 1); + fillDefault(docID - lastDocId - 1); } assert datOut != null; - datOut.writeInt(Float.floatToRawIntBits((float) v)); - ++lastDocId; + datOut.writeInt(Float.floatToRawIntBits((float) floatsRef.get())); + lastDocId = docID; } @Override public void finish(int docCount) throws IOException { + boolean success = false; try { - if (docCount > lastDocId + 1) - for (int i = lastDocId; i < docCount; i++) { - datOut.writeInt(INT_DEFAULT); // default value + int numDefaultsToAppend = docCount - (lastDocId + 1); + if (datOut == null) { + initDataOut(); + for (int i = 0; i <= lastDocId; i++) { + datOut.writeInt(values[i]); } + } + fillDefault(numDefaultsToAppend); + success = true; } finally { - datOut.close(); + bytesUsed.addAndGet(-(RamUsageEstimator.NUM_BYTES_INT + * ((values.length)))); + values = null; + IOUtils.closeSafely(!success, datOut); } } - @Override - protected int fillDefault(int numValues) throws IOException { - for (int i = 0; i < numValues; i++) { - datOut.writeInt(INT_DEFAULT); - } - return numValues; - } + } // Writes 8 bytes (double) per value static class Float8Writer extends FloatsWriter { - + private long[] values; protected Float8Writer(Directory dir, String id, AtomicLong bytesUsed) throws IOException { super(dir, id, 8, bytesUsed); + values = new long[1]; + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_LONG); } @Override public void add(int docID, double v) throws IOException { assert docID > lastDocId : "docID: " + docID + " must be greater than the last added doc id: " + lastDocId; + if (docID >= values.length) { + final long len = values.length; + values = ArrayUtil.grow(values, 1 + docID); + bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_LONG + * ((values.length) - len)); + } + values[docID] = Double.doubleToLongBits(v); + lastDocId = docID; + } + + @Override + protected void mergeDoc(int docID) throws IOException { + assert docID > lastDocId : "docID: " + docID + + " must be greater than the last added doc id: " + lastDocId; if (docID - lastDocId > 1) { // fill with default values lastDocId += fillDefault(docID - lastDocId - 1); } assert datOut != null; - datOut.writeLong(Double.doubleToRawLongBits(v)); - ++lastDocId; + datOut.writeLong(Double.doubleToRawLongBits((float) floatsRef.get())); + lastDocId = docID; } @Override public void finish(int docCount) throws IOException { + boolean success = false; try { - if (docCount > lastDocId + 1) - for (int i = lastDocId; i < docCount; i++) { - datOut.writeLong(LONG_DEFAULT); // default value + int numDefaultsToAppend = docCount - (lastDocId + 1); + if (datOut == null) { + initDataOut(); + for (int i = 0; i <= lastDocId; i++) { + datOut.writeLong(values[i]); } + } + fillDefault(numDefaultsToAppend); + success = true; } finally { - datOut.close(); + bytesUsed.addAndGet(-(RamUsageEstimator.NUM_BYTES_LONG + * ((values.length)))); + values = null; + IOUtils.closeSafely(!success, datOut); } } - - @Override - protected int fillDefault(int numValues) throws IOException { - for (int i = 0; i < numValues; i++) { - datOut.writeLong(LONG_DEFAULT); - } - return numValues; - } } /** diff --git a/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java b/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java index dc626e6fa84..4921f4b777d 100644 --- a/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java +++ b/lucene/src/java/org/apache/lucene/index/values/IntsImpl.java @@ -157,7 +157,7 @@ class IntsImpl { } @Override - protected void add(int docID) throws IOException { + protected void mergeDoc(int docID) throws IOException { add(docID, intsRef.get()); } diff --git a/lucene/src/java/org/apache/lucene/index/values/Writer.java b/lucene/src/java/org/apache/lucene/index/values/Writer.java index e3444545773..bdb7d65dc7f 100644 --- a/lucene/src/java/org/apache/lucene/index/values/Writer.java +++ b/lucene/src/java/org/apache/lucene/index/values/Writer.java @@ -113,11 +113,11 @@ public abstract class Writer extends DocValuesConsumer { * the {@link Writer} implementation. The given document ID must always be * greater than the previous ID or 0 if called the first time. */ - protected abstract void add(int docID) throws IOException; + protected abstract void mergeDoc(int docID) throws IOException; /** * Sets the next {@link ValuesEnum} to consume values from on calls to - * {@link #add(int)} + * {@link #mergeDoc(int)} * * @param valuesEnum * the next {@link ValuesEnum}, this must not be null @@ -159,7 +159,7 @@ public abstract class Writer extends DocValuesConsumer { } } if (currentDocId == i) { // we are on the doc to merge - add(docID); + mergeDoc(docID); } ++docID; }