diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 10ff754b55b..beb50cb4411 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -548,6 +548,10 @@ New features * LUCENE-2526: Don't throw NPE from MultiPhraseQuery.toString when it's empty. (Ross Woolf via Mike McCandless) + +* LUCENE-2589: Add VariableIntBlockIndexInput/Output, which, when used w/ + Sep*, make it simple to take any variable-sized int block coder + (like Simple9/16) and use it in a codec. (Mike McCandless) Optimizations diff --git a/lucene/src/java/org/apache/lucene/index/codecs/Codec.java b/lucene/src/java/org/apache/lucene/index/codecs/Codec.java index 42984ee33e3..bbb03d3b673 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/Codec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/Codec.java @@ -56,4 +56,9 @@ public abstract class Codec { /** Records all file extensions this codec uses */ public abstract void getExtensions(Set extensions); + + @Override + public String toString() { + return name; + } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java index 4c71c0c0b12..a8a7b23c509 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java @@ -115,7 +115,7 @@ class DefaultCodecProvider extends CodecProvider { DefaultCodecProvider() { register(new StandardCodec()); register(new PreFlexCodec()); - register(new PulsingCodec()); + register(new PulsingCodec(1)); } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java index 42636f46c8f..12e7a8c6b0d 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java @@ -37,10
+37,10 @@ import org.apache.lucene.util.IntsRef; */ public abstract class FixedIntBlockIndexInput extends IntIndexInput { - private IndexInput in; - protected int blockSize; - - protected void init(final IndexInput in) throws IOException { + private final IndexInput in; + protected final int blockSize; + + public FixedIntBlockIndexInput(final IndexInput in) throws IOException { this.in = in; blockSize = in.readVInt(); } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java index 65e78d12096..c77c6d39163 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java @@ -36,19 +36,19 @@ import org.apache.lucene.store.IndexOutput; */ public abstract class FixedIntBlockIndexOutput extends IntIndexOutput { - private IndexOutput out; - private int blockSize; - private int[] pending; + protected final IndexOutput out; + private final int blockSize; + protected final int[] buffer; private int upto; - protected void init(IndexOutput out, int fixedBlockSize) throws IOException { + protected FixedIntBlockIndexOutput(IndexOutput out, int fixedBlockSize) throws IOException { blockSize = fixedBlockSize; - out.writeVInt(blockSize); this.out = out; - pending = new int[blockSize]; + out.writeVInt(blockSize); + buffer = new int[blockSize]; } - protected abstract void flushBlock(int[] buffer, IndexOutput out) throws IOException; + protected abstract void flushBlock() throws IOException; @Override public Index index() throws IOException { @@ -96,9 +96,9 @@ public abstract class FixedIntBlockIndexOutput extends IntIndexOutput { @Override public void write(int v) throws IOException { - pending[upto++] = v; + buffer[upto++] = v; if (upto == blockSize) { - flushBlock(pending, out); + flushBlock(); upto = 0; } } @@ -107,9 +107,9 @@ 
public abstract class FixedIntBlockIndexOutput extends IntIndexOutput { public void close() throws IOException { try { if (upto > 0) { - // NOTE: entries in the block after current upto are - // invalid - flushBlock(pending, out); + // NOTE: entries in the block after current upto are + // invalid + flushBlock(); } } finally { out.close(); diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java new file mode 100644 index 00000000000..8883399ffe4 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java @@ -0,0 +1,216 @@ +package org.apache.lucene.index.codecs.intblock; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Naive int block API that writes vInts. This is + * expected to give poor performance; it's really only for + * testing the pluggability. One should typically use pfor instead. 
*/ + +import java.io.IOException; + +import org.apache.lucene.index.codecs.sep.IntIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IntsRef; + +// TODO: much of this can be shared code w/ the fixed case + +/** Abstract base class that reads variable-size blocks of ints + * from an IndexInput. While this is a simple approach, a + * more performant approach would directly create an impl + * of IntIndexInput inside Directory. Wrapping a generic + * IndexInput will likely cost performance. + * + * @lucene.experimental + */ +public abstract class VariableIntBlockIndexInput extends IntIndexInput { + + protected final IndexInput in; + protected final int maxBlockSize; + + protected VariableIntBlockIndexInput(final IndexInput in) throws IOException { + this.in = in; + maxBlockSize = in.readInt(); + } + + @Override + public Reader reader() throws IOException { + final int[] buffer = new int[maxBlockSize]; + final IndexInput clone = (IndexInput) in.clone(); + // TODO: can this be simplified? 
+ return new Reader(clone, buffer, this.getBlockReader(clone, buffer)); + } + + @Override + public void close() throws IOException { + in.close(); + } + + @Override + public Index index() { + return new Index(); + } + + protected abstract BlockReader getBlockReader(IndexInput in, int[] buffer) throws IOException; + + public interface BlockReader { + public int readBlock() throws IOException; + public void seek(long pos) throws IOException; + } + + public static class Reader extends IntIndexInput.Reader { + private final IndexInput in; + + public final int[] pending; + int upto; + + private boolean seekPending; + private long pendingFP; + private int pendingUpto; + private long lastBlockFP; + private int blockSize; + private final BlockReader blockReader; + private final IntsRef bulkResult = new IntsRef(); + + public Reader(final IndexInput in, final int[] pending, final BlockReader blockReader) + throws IOException { + this.in = in; + this.pending = pending; + bulkResult.ints = pending; + this.blockReader = blockReader; + } + + void seek(final long fp, final int upto) throws IOException { + // TODO: should we do this in real-time, not lazy? + pendingFP = fp; + pendingUpto = upto; + assert pendingUpto >= 0: "pendingUpto=" + pendingUpto; + seekPending = true; + } + + private final void maybeSeek() throws IOException { + if (seekPending) { + if (pendingFP != lastBlockFP) { + // need new block + in.seek(pendingFP); + blockReader.seek(pendingFP); + lastBlockFP = pendingFP; + blockSize = blockReader.readBlock(); + } + upto = pendingUpto; + + // TODO: if we were more clever when writing the + // index, such that a seek point wouldn't be written + // until the int encoder "committed", we could avoid + // this (likely minor) inefficiency: + + // This is necessary for int encoders that are + // non-causal, ie must see future int values to + // encode the current ones. 
+ while(upto >= blockSize) { + upto -= blockSize; + lastBlockFP = in.getFilePointer(); + blockSize = blockReader.readBlock(); + } + seekPending = false; + } + } + + @Override + public int next() throws IOException { + this.maybeSeek(); + if (upto == blockSize) { + lastBlockFP = in.getFilePointer(); + blockSize = blockReader.readBlock(); + upto = 0; + } + + return pending[upto++]; + } + + @Override + public IntsRef read(final int count) throws IOException { + this.maybeSeek(); + if (upto == blockSize) { + lastBlockFP = in.getFilePointer(); + blockSize = blockReader.readBlock(); + upto = 0; + } + bulkResult.offset = upto; + if (upto + count < blockSize) { + bulkResult.length = count; + upto += count; + } else { + bulkResult.length = blockSize - upto; + upto = blockSize; + } + + return bulkResult; + } + } + + private class Index extends IntIndexInput.Index { + private long fp; + private int upto; + + @Override + public void read(final IndexInput indexIn, final boolean absolute) throws IOException { + if (absolute) { + fp = indexIn.readVLong(); + upto = indexIn.readByte()&0xFF; + } else { + final long delta = indexIn.readVLong(); + if (delta == 0) { + // same block + upto = indexIn.readByte()&0xFF; + } else { + // new block + fp += delta; + upto = indexIn.readByte()&0xFF; + } + } + // TODO: we can't do this assert because non-causal + // int encoders can have upto over the buffer size + //assert upto < maxBlockSize: "upto=" + upto + " max=" + maxBlockSize; + } + + @Override + public String toString() { + return "VarIntBlock.Index fp=" + fp + " upto=" + upto + " maxBlock=" + maxBlockSize; + } + + @Override + public void seek(final IntIndexInput.Reader other) throws IOException { + ((Reader) other).seek(fp, upto); + } + + @Override + public void set(final IntIndexInput.Index other) { + final Index idx = (Index) other; + fp = idx.fp; + upto = idx.upto; + } + + @Override + public Object clone() { + Index other = new Index(); + other.fp = fp; + other.upto = upto; + return 
other; + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java new file mode 100644 index 00000000000..1dfbf32b3c2 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexOutput.java @@ -0,0 +1,128 @@ +package org.apache.lucene.index.codecs.intblock; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Naive int block API that writes vInts. This is + * expected to give poor performance; it's really only for + * testing the pluggability. One should typically use pfor instead. */ + +import java.io.IOException; + +import org.apache.lucene.index.codecs.sep.IntIndexOutput; +import org.apache.lucene.store.IndexOutput; + +// TODO: much of this can be shared code w/ the fixed case + +/** Abstract base class that writes variable-size blocks of ints + * to an IndexOutput. While this is a simple approach, a + * more performant approach would directly create an impl + * of IntIndexOutput inside Directory. Wrapping a generic + * IndexOutput will likely cost performance.
+ * + * @lucene.experimental + */ +public abstract class VariableIntBlockIndexOutput extends IntIndexOutput { + + protected final IndexOutput out; + + private int upto; + + private static final int MAX_BLOCK_SIZE = 1 << 8; + + /** NOTE: maxBlockSize plus the max non-causal lookahead + * of your codec must be less than 256. EG Simple9 + * requires lookahead=1 because on seeing the Nth value + * it knows it must now encode the N-1 values before it. */ + protected VariableIntBlockIndexOutput(IndexOutput out, int maxBlockSize) throws IOException { + if (maxBlockSize > MAX_BLOCK_SIZE) { + throw new IllegalArgumentException("maxBlockSize must be <= " + MAX_BLOCK_SIZE + "; got " + maxBlockSize); + } + this.out = out; + out.writeInt(maxBlockSize); + } + + /** Called one value at a time. Return the number of + * buffered input values that have been written to out. */ + protected abstract int add(int value) throws IOException; + + @Override + public Index index() throws IOException { + return new Index(); + } + + private class Index extends IntIndexOutput.Index { + long fp; + int upto; + long lastFP; + int lastUpto; + + @Override + public void mark() throws IOException { + fp = out.getFilePointer(); + upto = VariableIntBlockIndexOutput.this.upto; + } + + @Override + public void set(IntIndexOutput.Index other) throws IOException { + Index idx = (Index) other; + lastFP = fp = idx.fp; + lastUpto = upto = idx.upto; + } + + @Override + public void write(IndexOutput indexOut, boolean absolute) throws IOException { + assert upto >= 0; + if (absolute) { + indexOut.writeVLong(fp); + indexOut.writeByte((byte) upto); + } else if (fp == lastFP) { + // same block + indexOut.writeVLong(0); + assert upto >= lastUpto; + indexOut.writeByte((byte) upto); + } else { + // new block + indexOut.writeVLong(fp - lastFP); + indexOut.writeByte((byte) upto); + } + lastUpto = upto; + lastFP = fp; + } + } + + @Override + public void write(int v) throws IOException { + upto -= add(v)-1; + assert upto >= 
0; + } + + @Override + public void close() throws IOException { + try { + // stuff 0s in until the "real" data is flushed: + int stuffed = 0; + while(upto > stuffed) { + upto -= add(0)-1; + assert upto >= 0; + stuffed += 1; + } + } finally { + out.close(); + } + } +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java index ff5568981f2..f01f1e70d27 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java @@ -50,8 +50,18 @@ import org.apache.lucene.util.BytesRef; public class PulsingCodec extends Codec { - public PulsingCodec() { + private final int freqCutoff; + + /** Terms with freq <= freqCutoff are inlined into terms + * dict. */ + public PulsingCodec(int freqCutoff) { name = "Pulsing"; + this.freqCutoff = freqCutoff; + } + + @Override + public String toString() { + return name + "(freqCutoff=" + freqCutoff + ")"; } @Override @@ -62,7 +72,6 @@ public class PulsingCodec extends Codec { // Terms that have <= freqCutoff number of docs are // "pulsed" (inlined): - final int freqCutoff = 1; StandardPostingsWriter pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter); // Terms dict index diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java index 5b40d4a1975..fe0097207e1 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java @@ -169,6 +169,11 @@ public class SepPostingsReaderImpl extends StandardPostingsReader { skipOffset = other.skipOffset; payloadOffset = other.payloadOffset; } + + @Override + public String toString() { + return "tis.fp=" + filePointer + " docFreq=" + docFreq + " ord=" + ord + " 
docIndex=" + docIndex; + } } @Override @@ -629,12 +634,10 @@ public class SepPostingsReaderImpl extends StandardPostingsReader { // positions while (pendingPosCount > freq) { final int code = posReader.next(); - if (storePayloads) { - if ((code & 1) != 0) { - // Payload length has changed - payloadLength = posReader.next(); - assert payloadLength >= 0; - } + if (storePayloads && (code & 1) != 0) { + // Payload length has changed + payloadLength = posReader.next(); + assert payloadLength >= 0; } pendingPosCount--; payloadPending = true; diff --git a/lucene/src/java/org/apache/lucene/util/CodecUtil.java b/lucene/src/java/org/apache/lucene/util/CodecUtil.java index 5e4c1d87b88..1ee912edd1f 100644 --- a/lucene/src/java/org/apache/lucene/util/CodecUtil.java +++ b/lucene/src/java/org/apache/lucene/util/CodecUtil.java @@ -33,7 +33,7 @@ import java.io.IOException; public final class CodecUtil { private final static int CODEC_MAGIC = 0x3fd76c17; - public static void writeHeader(IndexOutput out, String codec, int version) + public static IndexOutput writeHeader(IndexOutput out, String codec, int version) throws IOException { final long start = out.getFilePointer(); out.writeInt(CODEC_MAGIC); @@ -44,6 +44,8 @@ public final class CodecUtil { if (out.getFilePointer()-start != codec.length()+9) { throw new IllegalArgumentException("codec must be simple ASCII, less than 128 characters in length [got " + codec + "]"); } + + return out; } public static int headerLength(String codec) { diff --git a/lucene/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/src/test/org/apache/lucene/index/TestCodecs.java index c22a68e2bef..a53e0b68290 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/src/test/org/apache/lucene/index/TestCodecs.java @@ -268,7 +268,7 @@ public class TestCodecs extends MultiCodecTestCase { final int NUM_TERMS = 100; final TermData[] terms = new TermData[NUM_TERMS]; for(int i=0;i= count: "buffer.length=" + buffer.length + " count=" 
+ count; + for(int i=0;i files) { + SepPostingsReaderImpl.files(segmentInfo, files); + StandardTermsDictReader.files(dir, segmentInfo, files); + SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + } + + @Override + public void getExtensions(Set extensions) { + SepPostingsWriterImpl.getExtensions(extensions); + StandardTermsDictReader.getExtensions(extensions); + SimpleStandardTermsIndexReader.getIndexExtensions(extensions); + } +} diff --git a/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java b/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java index e284167878e..57573fe8c8d 100644 --- a/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java @@ -32,6 +32,7 @@ import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache.CacheEntry; @@ -83,7 +84,7 @@ public abstract class LuceneTestCase extends TestCase { private volatile Thread.UncaughtExceptionHandler savedUncaughtExceptionHandler = null; - private String codec; + private Codec codec; /** Used to track if setUp and tearDown are called correctly from subclasses */ private boolean setup; @@ -307,9 +308,7 @@ public abstract class LuceneTestCase extends TestCase { seed = null; super.runBare(); } catch (Throwable e) { - if (TEST_CODEC.equals("random")) { - System.out.println("NOTE: random codec of testcase '" + getName() + "' was: " + codec); - } + System.out.println("NOTE: random codec of testcase '" + getName() + "' was: " + codec); if (seed != null) { System.out.println("NOTE: random seed of testcase '" + getName() + "' was: " + seed); } diff --git a/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java 
b/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java index c16cf2a1fec..1c76be7ad13 100644 --- a/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java +++ b/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java @@ -31,8 +31,10 @@ import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec; import org.apache.lucene.index.codecs.preflex.PreFlexCodec; +import org.apache.lucene.index.codecs.pulsing.PulsingCodec; import org.apache.lucene.index.codecs.mocksep.MockSepCodec; import org.apache.lucene.index.codecs.mockintblock.MockFixedIntBlockCodec; +import org.apache.lucene.index.codecs.mockintblock.MockVariableIntBlockCodec; import org.junit.After; import org.junit.AfterClass; @@ -154,9 +156,9 @@ public class LuceneTestCaseJ4 { // saves default codec: we do this statically as many build indexes in @beforeClass private static String savedDefaultCodec; - private static String codec; + private static Codec codec; - private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock"}; + private static final String[] TEST_CODECS = new String[] {"MockSep", "MockFixedIntBlock", "MockVariableIntBlock"}; private static void swapCodec(Codec c) { final CodecProvider cp = CodecProvider.getDefault(); @@ -172,7 +174,7 @@ public class LuceneTestCaseJ4 { } // returns current default codec - static String installTestCodecs() { + static Codec installTestCodecs() { final CodecProvider cp = CodecProvider.getDefault(); savedDefaultCodec = CodecProvider.getDefaultCodec(); @@ -190,15 +192,18 @@ public class LuceneTestCaseJ4 { } swapCodec(new MockSepCodec()); - swapCodec(new MockFixedIntBlockCodec()); + swapCodec(new PulsingCodec(_TestUtil.nextInt(seedRnd, 1, 20))); + swapCodec(new MockFixedIntBlockCodec(_TestUtil.nextInt(seedRnd, 1, 2000))); + // baseBlockSize cannot be over 127: + swapCodec(new MockVariableIntBlockCodec(_TestUtil.nextInt(seedRnd, 1, 
127))); - return codec; + return cp.lookup(codec); } // returns current PreFlex codec - static void removeTestCodecs(String codec) { + static void removeTestCodecs(Codec codec) { final CodecProvider cp = CodecProvider.getDefault(); - if (codec.equals("PreFlex")) { + if (codec.name.equals("PreFlex")) { final Codec preFlex = cp.lookup("PreFlex"); if (preFlex != null) { cp.unregister(preFlex); @@ -207,6 +212,8 @@ public class LuceneTestCaseJ4 { } cp.unregister(cp.lookup("MockSep")); cp.unregister(cp.lookup("MockFixedIntBlock")); + cp.unregister(cp.lookup("MockVariableIntBlock")); + swapCodec(new PulsingCodec(1)); CodecProvider.setDefaultCodec(savedDefaultCodec); } @@ -530,9 +537,7 @@ public class LuceneTestCaseJ4 { System.out.println("NOTE: random static seed of testclass '" + getName() + "' was: " + staticSeed); } - if (TEST_CODEC.equals("random")) { - System.out.println("NOTE: random codec of testcase '" + getName() + "' was: " + codec); - } + System.out.println("NOTE: random codec of testcase '" + getName() + "' was: " + codec); if (seed != null) { System.out.println("NOTE: random seed of testcase '" + getName() + "' was: " + seed);