diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 019a147ed53..411680e8385 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -89,6 +89,18 @@ New features
   http://blog.mikemccandless.com/2013/08/suggeststopfilter-carefully-removes.html
   for details.
 
+* LUCENE-5178: Added support for missing values to DocValues fields.
+  AtomicReader.getDocsWithField returns a Bits of the documents with a value,
+  and FieldCache.getDocsWithField forwards to that for DocValues fields.
+  SortField.setMissingValue, FunctionValues.exists, and FieldValueFilter
+  now work with DocValues fields.  (Robert Muir)
+
+* LUCENE-5124: Lucene 4.5 has a new Lucene45Codec with Lucene45DocValuesFormat,
+  supporting missing values and with most data structures residing off-heap.
+  Added a "Memory" docvalues format that works entirely in heap, and a "Disk"
+  format that loads no data structures into RAM. Both also support missing
+  values. Added DiskNormsFormat (in case you want norms entirely on disk).
+  (Robert Muir)
+
 Bug Fixes
 
 * LUCENE-5116: IndexWriter.addIndexes(IndexReader...) should drop empty (or all
@@ -184,6 +196,15 @@ API Changes
 * LUCENE-5173: Lucene never writes segments with 0 documents anymore.
   (Shai Erera, Uwe Schindler, Robert Muir)
 
+* LUCENE-5178: SortedDocValues always returns a -1 ord when a document is
+  missing a value for the field.  Previously it only did this if the
+  SortedDocValues was produced by uninversion on the FieldCache.  (Robert Muir)
+
+Changes in Runtime Behavior
+
+* LUCENE-5178: DocValues codec consumer APIs (iterables) return null values
+  when a document has no value for the field.  (Robert Muir)
+
 Optimizations
 
 * LUCENE-5088: Added TermFilter to filter docs by a specific term.
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java
index 43a7d57eecf..f3fd35e6554 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java
@@ -22,8 +22,11 @@ import java.io.IOException;
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer;
+import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.BytesRef;
 
 /**
  * DocValues format that keeps most things on disk.
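A quick sketch of what the LUCENE-5178 entry above enables at the search API level. This is not part of the patch; the field name "price" and the query/searcher/atomicReader variables are hypothetical:

    // Sort by a numeric DocValues field, pushing documents without a value last:
    SortField byPrice = new SortField("price", SortField.Type.LONG);
    byPrice.setMissingValue(Long.MAX_VALUE);
    TopDocs hits = searcher.search(query, 10, new Sort(byPrice));

    // Per-segment view of which documents have a value; for DocValues fields
    // FieldCache.getDocsWithField forwards to AtomicReader.getDocsWithField:
    Bits docsWithPrice = FieldCache.DEFAULT.getDocsWithField(atomicReader, "price");

    // Or restrict a query to documents that have a value at all:
    Filter hasPrice = new FieldValueFilter("price");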
@@ -40,7 +43,12 @@ public final class DiskDocValuesFormat extends DocValuesFormat { @Override public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - return new DiskDocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + return new Lucene45DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION) { + @Override + protected void addTermsDict(FieldInfo field, Iterable values) throws IOException { + addBinaryField(field, values); + } + }; } @Override @@ -52,11 +60,4 @@ public final class DiskDocValuesFormat extends DocValuesFormat { public static final String DATA_EXTENSION = "dvdd"; public static final String META_CODEC = "DiskDocValuesMetadata"; public static final String META_EXTENSION = "dvdm"; - public static final int VERSION_START = 0; - public static final int VERSION_COMPRESSED_TERMS = 1; - public static final int VERSION_CURRENT = VERSION_COMPRESSED_TERMS; - public static final byte NUMERIC = 0; - public static final byte BINARY = 1; - public static final byte SORTED = 2; - public static final byte SORTED_SET = 3; } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java index c100b84142c..4972706ed67 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java @@ -17,726 +17,34 @@ package org.apache.lucene.codecs.diskdv; * limitations under the License. */ -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.DELTA_COMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.GCD_COMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.TABLE_COMPRESSED; - -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_PREFIX_COMPRESSED; - import java.io.IOException; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Map; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesProducer; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SortedDocValues; -import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.TermsEnum.SeekStatus; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.packed.BlockPackedReader; import org.apache.lucene.util.packed.MonotonicBlockPackedReader; -import org.apache.lucene.util.packed.PackedInts; -class DiskDocValuesProducer extends DocValuesProducer { - private final Map numerics; - private final Map binaries; - private final Map ords; - private 
final Map ordIndexes; - private final IndexInput data; +class DiskDocValuesProducer extends Lucene45DocValuesProducer { - // memory-resident structures - private final Map addressInstances = new HashMap(); - private final Map ordIndexInstances = new HashMap(); - DiskDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { - String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); - // read in the entries from the metadata file. - IndexInput in = state.directory.openInput(metaName, state.context); - boolean success = false; - final int version; - try { - version = CodecUtil.checkHeader(in, metaCodec, - DiskDocValuesFormat.VERSION_CURRENT, - DiskDocValuesFormat.VERSION_CURRENT); - numerics = new HashMap(); - ords = new HashMap(); - ordIndexes = new HashMap(); - binaries = new HashMap(); - readFields(in, state.fieldInfos); - - success = true; - } finally { - if (success) { - IOUtils.close(in); - } else { - IOUtils.closeWhileHandlingException(in); - } - } - - success = false; - try { - String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); - data = state.directory.openInput(dataName, state.context); - final int version2 = CodecUtil.checkHeader(data, dataCodec, - DiskDocValuesFormat.VERSION_CURRENT, - DiskDocValuesFormat.VERSION_CURRENT); - if (version != version2) { - throw new CorruptIndexException("Format versions mismatch"); - } - - success = true; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(this.data); - } - } - } - - private void readFields(IndexInput meta, FieldInfos infos) throws IOException { - int fieldNumber = meta.readVInt(); - while (fieldNumber != -1) { - byte type = meta.readByte(); - if (type == DiskDocValuesFormat.NUMERIC) { - numerics.put(fieldNumber, readNumericEntry(meta)); - } else if (type == DiskDocValuesFormat.BINARY) { - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - } else if (type == DiskDocValuesFormat.SORTED) { - // sorted = binary + numeric - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); - } - if (meta.readByte() != DiskDocValuesFormat.BINARY) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); - } - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); - } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); - } - NumericEntry n = readNumericEntry(meta); - ords.put(fieldNumber, n); - } else if (type == DiskDocValuesFormat.SORTED_SET) { - // sortedset = binary + numeric + ordIndex - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); - } - if (meta.readByte() != DiskDocValuesFormat.BINARY) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); - } - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - - if (meta.readVInt() != fieldNumber) { - throw new 
CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); - } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); - } - NumericEntry n1 = readNumericEntry(meta); - ords.put(fieldNumber, n1); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); - } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); - } - NumericEntry n2 = readNumericEntry(meta); - ordIndexes.put(fieldNumber, n2); - } else { - throw new CorruptIndexException("invalid type: " + type + ", resource=" + meta); - } - fieldNumber = meta.readVInt(); - } - } - - static NumericEntry readNumericEntry(IndexInput meta) throws IOException { - NumericEntry entry = new NumericEntry(); - entry.format = meta.readVInt(); - entry.packedIntsVersion = meta.readVInt(); - entry.offset = meta.readLong(); - entry.count = meta.readVLong(); - entry.blockSize = meta.readVInt(); - switch(entry.format) { - case GCD_COMPRESSED: - entry.minValue = meta.readLong(); - entry.gcd = meta.readLong(); - break; - case TABLE_COMPRESSED: - if (entry.count > Integer.MAX_VALUE) { - throw new CorruptIndexException("Cannot use TABLE_COMPRESSED with more than MAX_VALUE values, input=" + meta); - } - final int uniqueValues = meta.readVInt(); - if (uniqueValues > 256) { - throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + meta); - } - entry.table = new long[uniqueValues]; - for (int i = 0; i < uniqueValues; ++i) { - entry.table[i] = meta.readLong(); - } - break; - case DELTA_COMPRESSED: - break; - default: - throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); - } - return entry; - } - - static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException { - BinaryEntry entry = new BinaryEntry(); - entry.format = meta.readVInt(); - entry.minLength = meta.readVInt(); - entry.maxLength = meta.readVInt(); - entry.count = meta.readVLong(); - entry.offset = meta.readLong(); - switch(entry.format) { - case BINARY_FIXED_UNCOMPRESSED: - break; - case BINARY_PREFIX_COMPRESSED: - entry.addressInterval = meta.readVInt(); - entry.addressesOffset = meta.readLong(); - entry.packedIntsVersion = meta.readVInt(); - entry.blockSize = meta.readVInt(); - break; - case BINARY_VARIABLE_UNCOMPRESSED: - entry.addressesOffset = meta.readLong(); - entry.packedIntsVersion = meta.readVInt(); - entry.blockSize = meta.readVInt(); - break; - default: - throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); - } - return entry; + super(state, dataCodec, dataExtension, metaCodec, metaExtension); } @Override - public NumericDocValues getNumeric(FieldInfo field) throws IOException { - NumericEntry entry = numerics.get(field.number); - return getNumeric(entry); + protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException { + data.seek(bytes.addressesOffset); + return new MonotonicBlockPackedReader(data.clone(), bytes.packedIntsVersion, bytes.blockSize, bytes.count, true); } - - LongNumericDocValues getNumeric(NumericEntry entry) throws IOException { - final IndexInput data = this.data.clone(); + + @Override + protected 
MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException { + throw new AssertionError(); + } + + @Override + protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException { data.seek(entry.offset); - - switch (entry.format) { - case DELTA_COMPRESSED: - final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - return new LongNumericDocValues() { - @Override - public long get(long id) { - return reader.get(id); - } - }; - case GCD_COMPRESSED: - final long min = entry.minValue; - final long mult = entry.gcd; - final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - return new LongNumericDocValues() { - @Override - public long get(long id) { - return min + mult * quotientReader.get(id); - } - }; - case TABLE_COMPRESSED: - final long table[] = entry.table; - final int bitsRequired = PackedInts.bitsRequired(table.length - 1); - final PackedInts.Reader ords = PackedInts.getDirectReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, (int) entry.count, bitsRequired); - return new LongNumericDocValues() { - @Override - public long get(long id) { - return table[(int) ords.get((int) id)]; - } - }; - default: - throw new AssertionError(); - } - } - - @Override - public BinaryDocValues getBinary(FieldInfo field) throws IOException { - BinaryEntry bytes = binaries.get(field.number); - switch(bytes.format) { - case BINARY_FIXED_UNCOMPRESSED: - return getFixedBinary(field, bytes); - case BINARY_VARIABLE_UNCOMPRESSED: - return getVariableBinary(field, bytes); - case BINARY_PREFIX_COMPRESSED: - return getCompressedBinary(field, bytes); - default: - throw new AssertionError(); - } - } - - private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) { - final IndexInput data = this.data.clone(); - - return new LongBinaryDocValues() { - @Override - public void get(long id, BytesRef result) { - long address = bytes.offset + id * bytes.maxLength; - try { - data.seek(address); - // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) - // assume "they" own the bytes after calling this! - final byte[] buffer = new byte[bytes.maxLength]; - data.readBytes(buffer, 0, buffer.length); - result.bytes = buffer; - result.offset = 0; - result.length = buffer.length; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; - } - - private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final IndexInput data = this.data.clone(); - - final MonotonicBlockPackedReader addresses; - synchronized (addressInstances) { - MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number); - if (addrInstance == null) { - data.seek(bytes.addressesOffset); - addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false); - addressInstances.put(field.number, addrInstance); - } - addresses = addrInstance; - } - - return new LongBinaryDocValues() { - @Override - public void get(long id, BytesRef result) { - long startAddress = bytes.offset + (id == 0 ? 0 : addresses.get(id-1)); - long endAddress = bytes.offset + addresses.get(id); - int length = (int) (endAddress - startAddress); - try { - data.seek(startAddress); - // NOTE: we could have one buffer, but various consumers (e.g. 
FieldComparatorSource) - // assume "they" own the bytes after calling this! - final byte[] buffer = new byte[length]; - data.readBytes(buffer, 0, buffer.length); - result.bytes = buffer; - result.offset = 0; - result.length = length; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; - } - - private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final IndexInput data = this.data.clone(); - final long interval = bytes.addressInterval; - - final MonotonicBlockPackedReader addresses; - synchronized (addressInstances) { - MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number); - if (addrInstance == null) { - data.seek(bytes.addressesOffset); - final long size; - if (bytes.count % interval == 0) { - size = bytes.count / interval; - } else { - size = 1L + bytes.count / interval; - } - addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false); - addressInstances.put(field.number, addrInstance); - } - addresses = addrInstance; - } - - return new CompressedBinaryDocValues(bytes, addresses, data); - } - - @Override - public SortedDocValues getSorted(FieldInfo field) throws IOException { - final int valueCount = (int) binaries.get(field.number).count; - final BinaryDocValues binary = getBinary(field); - NumericEntry entry = ords.get(field.number); - IndexInput data = this.data.clone(); - data.seek(entry.offset); - final BlockPackedReader ordinals = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - - return new SortedDocValues() { - - @Override - public int getOrd(int docID) { - return (int) ordinals.get(docID); - } - - @Override - public void lookupOrd(int ord, BytesRef result) { - binary.get(ord, result); - } - - @Override - public int getValueCount() { - return valueCount; - } - - @Override - public int lookupTerm(BytesRef key) { - if (binary instanceof CompressedBinaryDocValues) { - return (int) ((CompressedBinaryDocValues)binary).lookupTerm(key); - } else { - return super.lookupTerm(key); - } - } - - @Override - public TermsEnum termsEnum() { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues)binary).getTermsEnum(); - } else { - return super.termsEnum(); - } - } - }; - } - - @Override - public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - final long valueCount = binaries.get(field.number).count; - // we keep the byte[]s and list of ords on disk, these could be large - final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); - final LongNumericDocValues ordinals = getNumeric(ords.get(field.number)); - // but the addresses to the ord stream are in RAM - final MonotonicBlockPackedReader ordIndex; - synchronized (ordIndexInstances) { - MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number); - if (ordIndexInstance == null) { - NumericEntry entry = ordIndexes.get(field.number); - IndexInput data = this.data.clone(); - data.seek(entry.offset); - ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false); - ordIndexInstances.put(field.number, ordIndexInstance); - } - ordIndex = ordIndexInstance; - } - - return new SortedSetDocValues() { - long offset; - long endOffset; - - @Override - public long nextOrd() { - if (offset == endOffset) { - return NO_MORE_ORDS; - } else { - long ord = ordinals.get(offset); - offset++; - return ord; - } - } - - @Override 
- public void setDocument(int docID) { - offset = (docID == 0 ? 0 : ordIndex.get(docID-1)); - endOffset = ordIndex.get(docID); - } - - @Override - public void lookupOrd(long ord, BytesRef result) { - binary.get(ord, result); - } - - @Override - public long getValueCount() { - return valueCount; - } - - @Override - public long lookupTerm(BytesRef key) { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues)binary).lookupTerm(key); - } else { - return super.lookupTerm(key); - } - } - - @Override - public TermsEnum termsEnum() { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues)binary).getTermsEnum(); - } else { - return super.termsEnum(); - } - } - }; - } - - @Override - public void close() throws IOException { - data.close(); - } - - static class NumericEntry { - long offset; - - int format; - int packedIntsVersion; - long count; - int blockSize; - - long minValue; - long gcd; - long table[]; - } - - static class BinaryEntry { - long offset; - - int format; - long count; - int minLength; - int maxLength; - long addressesOffset; - long addressInterval; - int packedIntsVersion; - int blockSize; - } - - // internally we compose complex dv (sorted/sortedset) from other ones - static abstract class LongNumericDocValues extends NumericDocValues { - @Override - public final long get(int docID) { - return get((long) docID); - } - - abstract long get(long id); - } - - static abstract class LongBinaryDocValues extends BinaryDocValues { - @Override - public final void get(int docID, BytesRef result) { - get((long)docID, result); - } - - abstract void get(long id, BytesRef Result); - } - - // in the compressed case, we add a few additional operations for - // more efficient reverse lookup and enumeration - static class CompressedBinaryDocValues extends LongBinaryDocValues { - final BinaryEntry bytes; - final long interval; - final long numValues; - final long numIndexValues; - final MonotonicBlockPackedReader addresses; - final IndexInput data; - final TermsEnum termsEnum; - - public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, IndexInput data) throws IOException { - this.bytes = bytes; - this.interval = bytes.addressInterval; - this.addresses = addresses; - this.data = data; - this.numValues = bytes.count; - this.numIndexValues = addresses.size(); - this.termsEnum = getTermsEnum(data); - } - - @Override - public void get(long id, BytesRef result) { - try { - termsEnum.seekExact(id); - BytesRef term = termsEnum.term(); - result.bytes = term.bytes; - result.offset = term.offset; - result.length = term.length; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - long lookupTerm(BytesRef key) { - try { - SeekStatus status = termsEnum.seekCeil(key); - if (status == SeekStatus.END) { - return -numValues-1; - } else if (status == SeekStatus.FOUND) { - return termsEnum.ord(); - } else { - return -termsEnum.ord()-1; - } - } catch (IOException bogus) { - throw new RuntimeException(bogus); - } - } - - TermsEnum getTermsEnum() { - try { - return getTermsEnum(data.clone()); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private TermsEnum getTermsEnum(final IndexInput input) throws IOException { - input.seek(bytes.offset); - - return new TermsEnum() { - private long currentOrd = -1; - // TODO: maxLength is negative when all terms are merged away... - private final BytesRef termBuffer = new BytesRef(bytes.maxLength < 0 ? 
0 : bytes.maxLength); - private final BytesRef term = new BytesRef(); // TODO: paranoia? - - @Override - public BytesRef next() throws IOException { - if (doNext() == null) { - return null; - } else { - setTerm(); - return term; - } - } - - private BytesRef doNext() throws IOException { - if (++currentOrd >= numValues) { - return null; - } else { - int start = input.readVInt(); - int suffix = input.readVInt(); - input.readBytes(termBuffer.bytes, start, suffix); - termBuffer.length = start + suffix; - return termBuffer; - } - } - - @Override - public SeekStatus seekCeil(BytesRef text) throws IOException { - // binary-search just the index values to find the block, - // then scan within the block - long low = 0; - long high = numIndexValues-1; - - while (low <= high) { - long mid = (low + high) >>> 1; - doSeek(mid * interval); - int cmp = termBuffer.compareTo(text); - - if (cmp < 0) { - low = mid + 1; - } else if (cmp > 0) { - high = mid - 1; - } else { - // we got lucky, found an indexed term - setTerm(); - return SeekStatus.FOUND; - } - } - - if (numIndexValues == 0) { - return SeekStatus.END; - } - - // block before insertion point - long block = low-1; - doSeek(block < 0 ? -1 : block * interval); - - while (doNext() != null) { - int cmp = termBuffer.compareTo(text); - if (cmp == 0) { - setTerm(); - return SeekStatus.FOUND; - } else if (cmp > 0) { - setTerm(); - return SeekStatus.NOT_FOUND; - } - } - - return SeekStatus.END; - } - - @Override - public void seekExact(long ord) throws IOException { - doSeek(ord); - setTerm(); - } - - private void doSeek(long ord) throws IOException { - long block = ord / interval; - - if (ord >= currentOrd && block == currentOrd / interval) { - // seek within current block - } else { - // position before start of block - currentOrd = ord - ord % interval - 1; - input.seek(bytes.offset + addresses.get(block)); - } - - while (currentOrd < ord) { - doNext(); - } - } - - private void setTerm() { - // TODO: is there a cleaner way - term.bytes = new byte[termBuffer.length]; - term.offset = 0; - term.copyBytes(termBuffer); - } - - @Override - public BytesRef term() throws IOException { - return term; - } - - @Override - public long ord() throws IOException { - return currentOrd; - } - - @Override - public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUnicodeComparator(); - } - - @Override - public int docFreq() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long totalTermFreq() throws IOException { - return -1; - } - - @Override - public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { - throw new UnsupportedOperationException(); - } - }; - } + return new MonotonicBlockPackedReader(data.clone(), entry.packedIntsVersion, entry.blockSize, entry.count, true); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardNormsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskNormsFormat.java similarity index 70% rename from lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardNormsFormat.java rename to lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskNormsFormat.java index 5834f9c1f71..63a8ab9c3db 100644 --- 
a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardNormsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskNormsFormat.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.cheapbastard; +package org.apache.lucene.codecs.diskdv; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,25 +22,25 @@ import java.io.IOException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.NormsFormat; -import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; /** Norms format that keeps all norms on disk */ -public final class CheapBastardNormsFormat extends NormsFormat { +public final class DiskNormsFormat extends NormsFormat { @Override public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException { - return new DiskDocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + return new Lucene45DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); } @Override public DocValuesProducer normsProducer(SegmentReadState state) throws IOException { - return new CheapBastardDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + return new DiskDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); } - static final String DATA_CODEC = "CheapBastardNormsData"; - static final String DATA_EXTENSION = "cbnd"; - static final String META_CODEC = "CheapBastardNormsMetadata"; - static final String META_EXTENSION = "cbnm"; + static final String DATA_CODEC = "DiskNormsData"; + static final String DATA_EXTENSION = "dnvd"; + static final String META_CODEC = "DiskNormsMetadata"; + static final String META_EXTENSION = "dnvm"; } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java new file mode 100644 index 00000000000..2b3b9901fc5 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java @@ -0,0 +1,403 @@ +package org.apache.lucene.codecs.memory; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
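Since the rename above makes DiskNormsFormat a public codec component, here is one minimal way it could be wired into a codec. This is an untested sketch, not part of the patch; the class name DiskNormsCodec is made up, and it assumes the stock FilterCodec/Lucene45Codec pattern:

    public final class DiskNormsCodec extends FilterCodec {
      private final NormsFormat norms = new DiskNormsFormat();

      public DiskNormsCodec() {
        super("DiskNormsCodec", new Lucene45Codec());
      }

      @Override
      public NormsFormat normsFormat() {
        return norms;  // norms stay entirely on disk
      }
    }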
+ */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.NoSuchElementException; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.MathUtil; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FST.INPUT_TYPE; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.BlockPackedWriter; +import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; +import org.apache.lucene.util.packed.PackedInts.FormatAndBits; +import org.apache.lucene.util.packed.PackedInts; + +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.VERSION_CURRENT; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BLOCK_SIZE; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BYTES; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.NUMBER; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.FST; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.DELTA_COMPRESSED; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.GCD_COMPRESSED; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.TABLE_COMPRESSED; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.UNCOMPRESSED; + +/** + * Writer for {@link MemoryDocValuesFormat} + */ +class MemoryDocValuesConsumer extends DocValuesConsumer { + final IndexOutput data, meta; + final int maxDoc; + final float acceptableOverheadRatio; + + MemoryDocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension, float acceptableOverheadRatio) throws IOException { + this.acceptableOverheadRatio = acceptableOverheadRatio; + maxDoc = state.segmentInfo.getDocCount(); + boolean success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.createOutput(dataName, state.context); + CodecUtil.writeHeader(data, dataCodec, VERSION_CURRENT); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + meta = state.directory.createOutput(metaName, state.context); + CodecUtil.writeHeader(meta, metaCodec, VERSION_CURRENT); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void addNumericField(FieldInfo field, Iterable values) throws IOException { + addNumericField(field, values, true); + } + + void addNumericField(FieldInfo field, Iterable values, boolean optimizeStorage) throws IOException { + meta.writeVInt(field.number); + meta.writeByte(NUMBER); + meta.writeLong(data.getFilePointer()); + long minValue = Long.MAX_VALUE; + long maxValue = Long.MIN_VALUE; + long gcd = 0; + boolean missing = false; + // TODO: more efficient? 
+ HashSet uniqueValues = null; + if (optimizeStorage) { + uniqueValues = new HashSet<>(); + + long count = 0; + for (Number nv : values) { + final long v; + if (nv == null) { + v = 0; + missing = true; + } else { + v = nv.longValue(); + } + + if (gcd != 1) { + if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { + // in that case v - minValue might overflow and make the GCD computation return + // wrong results. Since these extreme values are unlikely, we just discard + // GCD computation for them + gcd = 1; + } else if (count != 0) { // minValue needs to be set first + gcd = MathUtil.gcd(gcd, v - minValue); + } + } + + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + + if (uniqueValues != null) { + if (uniqueValues.add(v)) { + if (uniqueValues.size() > 256) { + uniqueValues = null; + } + } + } + + ++count; + } + assert count == maxDoc; + } + + if (missing) { + long start = data.getFilePointer(); + writeMissingBitset(values); + meta.writeLong(start); + meta.writeLong(data.getFilePointer() - start); + } else { + meta.writeLong(-1L); + } + + if (uniqueValues != null) { + // small number of unique values + final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1); + FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio); + if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) { + meta.writeByte(UNCOMPRESSED); // uncompressed + for (Number nv : values) { + data.writeByte(nv == null ? 0 : (byte) nv.longValue()); + } + } else { + meta.writeByte(TABLE_COMPRESSED); // table-compressed + Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]); + final HashMap encode = new HashMap(); + data.writeVInt(decode.length); + for (int i = 0; i < decode.length; i++) { + data.writeLong(decode[i]); + encode.put(decode[i], i); + } + + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeVInt(formatAndBits.format.getId()); + data.writeVInt(formatAndBits.bitsPerValue); + + final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE); + for(Number nv : values) { + writer.add(encode.get(nv == null ? 0 : nv.longValue())); + } + writer.finish(); + } + } else if (gcd != 0 && gcd != 1) { + meta.writeByte(GCD_COMPRESSED); + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeLong(minValue); + data.writeLong(gcd); + data.writeVInt(BLOCK_SIZE); + + final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); + for (Number nv : values) { + long value = nv == null ? 0 : nv.longValue(); + writer.add((value - minValue) / gcd); + } + writer.finish(); + } else { + meta.writeByte(DELTA_COMPRESSED); // delta-compressed + + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeVInt(BLOCK_SIZE); + + final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); + for (Number nv : values) { + writer.add(nv == null ? 
0 : nv.longValue()); + } + writer.finish(); + } + } + + @Override + public void close() throws IOException { + boolean success = false; + try { + if (meta != null) { + meta.writeVInt(-1); // write EOF marker + } + success = true; + } finally { + if (success) { + IOUtils.close(data, meta); + } else { + IOUtils.closeWhileHandlingException(data, meta); + } + } + } + + @Override + public void addBinaryField(FieldInfo field, final Iterable values) throws IOException { + // write the byte[] data + meta.writeVInt(field.number); + meta.writeByte(BYTES); + int minLength = Integer.MAX_VALUE; + int maxLength = Integer.MIN_VALUE; + final long startFP = data.getFilePointer(); + boolean missing = false; + for(BytesRef v : values) { + final int length; + if (v == null) { + length = 0; + missing = true; + } else { + length = v.length; + } + if (length > MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH) { + throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH); + } + minLength = Math.min(minLength, length); + maxLength = Math.max(maxLength, length); + if (v != null) { + data.writeBytes(v.bytes, v.offset, v.length); + } + } + meta.writeLong(startFP); + meta.writeLong(data.getFilePointer() - startFP); + if (missing) { + long start = data.getFilePointer(); + writeMissingBitset(values); + meta.writeLong(start); + meta.writeLong(data.getFilePointer() - start); + } else { + meta.writeLong(-1L); + } + meta.writeVInt(minLength); + meta.writeVInt(maxLength); + + // if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit) + // otherwise, we need to record the length fields... + if (minLength != maxLength) { + meta.writeVInt(PackedInts.VERSION_CURRENT); + meta.writeVInt(BLOCK_SIZE); + + final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE); + long addr = 0; + for (BytesRef v : values) { + if (v != null) { + addr += v.length; + } + writer.add(addr); + } + writer.finish(); + } + } + + private void writeFST(FieldInfo field, Iterable values) throws IOException { + meta.writeVInt(field.number); + meta.writeByte(FST); + meta.writeLong(data.getFilePointer()); + PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); + Builder builder = new Builder(INPUT_TYPE.BYTE1, outputs); + IntsRef scratch = new IntsRef(); + long ord = 0; + for (BytesRef v : values) { + builder.add(Util.toIntsRef(v, scratch), ord); + ord++; + } + FST fst = builder.finish(); + if (fst != null) { + fst.save(data); + } + meta.writeVLong(ord); + } + + // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on, + // but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode) + void writeMissingBitset(Iterable values) throws IOException { + long bits = 0; + int count = 0; + for (Object v : values) { + if (count == 64) { + data.writeLong(bits); + count = 0; + bits = 0; + } + if (v != null) { + bits |= 1L << (count & 0x3f); + } + count++; + } + if (count > 0) { + data.writeLong(bits); + } + } + + @Override + public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + // write the ordinals as numerics + addNumericField(field, docToOrd, false); + + // write the values as FST + writeFST(field, values); + } + + // note: this might not be the most efficient... 
but its fairly simple + @Override + public void addSortedSetField(FieldInfo field, Iterable values, final Iterable docToOrdCount, final Iterable ords) throws IOException { + // write the ordinals as a binary field + addBinaryField(field, new Iterable() { + @Override + public Iterator iterator() { + return new SortedSetIterator(docToOrdCount.iterator(), ords.iterator()); + } + }); + + // write the values as FST + writeFST(field, values); + } + + // per-document vint-encoded byte[] + static class SortedSetIterator implements Iterator { + byte[] buffer = new byte[10]; + ByteArrayDataOutput out = new ByteArrayDataOutput(); + BytesRef ref = new BytesRef(); + + final Iterator counts; + final Iterator ords; + + SortedSetIterator(Iterator counts, Iterator ords) { + this.counts = counts; + this.ords = ords; + } + + @Override + public boolean hasNext() { + return counts.hasNext(); + } + + @Override + public BytesRef next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + + int count = counts.next().intValue(); + int maxSize = count*9; // worst case + if (maxSize > buffer.length) { + buffer = ArrayUtil.grow(buffer, maxSize); + } + + try { + encodeValues(count); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + + ref.bytes = buffer; + ref.offset = 0; + ref.length = out.getPosition(); + + return ref; + } + + // encodes count values to buffer + private void encodeValues(int count) throws IOException { + out.reset(buffer); + long lastOrd = 0; + for (int i = 0; i < count; i++) { + long ord = ords.next().longValue(); + out.writeVLong(ord - lastOrd); + lastOrd = ord; + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesFormat.java new file mode 100644 index 00000000000..2f6216db08b --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesFormat.java @@ -0,0 +1,72 @@ +package org.apache.lucene.codecs.memory; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.packed.PackedInts; + +/** In-memory docvalues format */ +public class MemoryDocValuesFormat extends DocValuesFormat { + + /** Maximum length for each binary doc values field. 
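A note on SortedSetIterator above: each document's ords are sorted, so they are written as vLong deltas (ords 5, 9, 13 go on the wire as 5, 4, 4), and MemoryDocValuesProducer re-accumulates them in nextOrd(). A standalone round-trip sketch with hypothetical values:

    byte[] buffer = new byte[3 * 9];               // worst case: 9 bytes per vLong
    ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
    long[] ords = { 5, 9, 13 };
    long lastOrd = 0;
    for (long ord : ords) {
      out.writeVLong(ord - lastOrd);               // writes 5, 4, 4
      lastOrd = ord;
    }

    ByteArrayDataInput in = new ByteArrayDataInput(buffer, 0, out.getPosition());
    long currentOrd = 0;
    while (!in.eof()) {
      currentOrd += in.readVLong();                // yields 5, 9, 13 again
    }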
*/ + public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2; + + final float acceptableOverheadRatio; + + /** + * Calls {@link #MemoryDocValuesFormat(float) + * MemoryDocValuesFormat(PackedInts.DEFAULT)} + */ + public MemoryDocValuesFormat() { + this(PackedInts.DEFAULT); + } + + /** + * Creates a new MemoryDocValuesFormat with the specified + * acceptableOverheadRatio for NumericDocValues. + * @param acceptableOverheadRatio compression parameter for numerics. + * Currently this is only used when the number of unique values is small. + * + * @lucene.experimental + */ + public MemoryDocValuesFormat(float acceptableOverheadRatio) { + super("Memory"); + this.acceptableOverheadRatio = acceptableOverheadRatio; + } + + @Override + public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new MemoryDocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, acceptableOverheadRatio); + } + + @Override + public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { + return new MemoryDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION); + } + + static final String DATA_CODEC = "MemoryDocValuesData"; + static final String DATA_EXTENSION = "mdvd"; + static final String METADATA_CODEC = "MemoryDocValuesMetadata"; + static final String METADATA_EXTENSION = "mdvm"; +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java new file mode 100644 index 00000000000..46ed8b8e9b9 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java @@ -0,0 +1,633 @@ +package org.apache.lucene.codecs.memory; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
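For context on how a field would opt into the Memory format defined above: Lucene's per-field codec hook can return it for selected fields. A sketch only, assuming Lucene45Codec exposes the usual getDocValuesFormatForField override point; the field name and analyzer are hypothetical:

    final DocValuesFormat memory = new MemoryDocValuesFormat();
    Codec codec = new Lucene45Codec() {
      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        // keep a small, hot field entirely in heap
        return "popularity".equals(field) ? memory : super.getDocValuesFormatForField(field);
      }
    };
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_45, analyzer);
    iwc.setCodec(codec);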
+ */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.fst.BytesRefFSTEnum; +import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.BlockPackedReader; +import org.apache.lucene.util.packed.MonotonicBlockPackedReader; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Reader for {@link MemoryDocValuesFormat} + */ +class MemoryDocValuesProducer extends DocValuesProducer { + // metadata maps (just file pointers and minimal stuff) + private final Map numerics; + private final Map binaries; + private final Map fsts; + private final IndexInput data; + + // ram instances we have already loaded + private final Map numericInstances = + new HashMap(); + private final Map binaryInstances = + new HashMap(); + private final Map> fstInstances = + new HashMap>(); + private final Map docsWithFieldInstances = new HashMap(); + + private final int maxDoc; + + + static final byte NUMBER = 0; + static final byte BYTES = 1; + static final byte FST = 2; + + static final int BLOCK_SIZE = 4096; + + static final byte DELTA_COMPRESSED = 0; + static final byte TABLE_COMPRESSED = 1; + static final byte UNCOMPRESSED = 2; + static final byte GCD_COMPRESSED = 3; + + static final int VERSION_START = 0; + static final int VERSION_GCD_COMPRESSION = 1; + static final int VERSION_CURRENT = VERSION_GCD_COMPRESSION; + + MemoryDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + maxDoc = state.segmentInfo.getDocCount(); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + // read in the entries from the metadata file. 
+ IndexInput in = state.directory.openInput(metaName, state.context); + boolean success = false; + final int version; + try { + version = CodecUtil.checkHeader(in, metaCodec, + VERSION_START, + VERSION_CURRENT); + numerics = new HashMap(); + binaries = new HashMap(); + fsts = new HashMap(); + readFields(in, state.fieldInfos); + + success = true; + } finally { + if (success) { + IOUtils.close(in); + } else { + IOUtils.closeWhileHandlingException(in); + } + } + + success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.openInput(dataName, state.context); + final int version2 = CodecUtil.checkHeader(data, dataCodec, + VERSION_START, + VERSION_CURRENT); + if (version != version2) { + throw new CorruptIndexException("Format versions mismatch"); + } + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this.data); + } + } + } + + private void readFields(IndexInput meta, FieldInfos infos) throws IOException { + int fieldNumber = meta.readVInt(); + while (fieldNumber != -1) { + int fieldType = meta.readByte(); + if (fieldType == NUMBER) { + NumericEntry entry = new NumericEntry(); + entry.offset = meta.readLong(); + entry.missingOffset = meta.readLong(); + if (entry.missingOffset != -1) { + entry.missingBytes = meta.readLong(); + } else { + entry.missingBytes = 0; + } + entry.format = meta.readByte(); + switch(entry.format) { + case DELTA_COMPRESSED: + case TABLE_COMPRESSED: + case GCD_COMPRESSED: + case UNCOMPRESSED: + break; + default: + throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); + } + if (entry.format != UNCOMPRESSED) { + entry.packedIntsVersion = meta.readVInt(); + } + numerics.put(fieldNumber, entry); + } else if (fieldType == BYTES) { + BinaryEntry entry = new BinaryEntry(); + entry.offset = meta.readLong(); + entry.numBytes = meta.readLong(); + entry.missingOffset = meta.readLong(); + if (entry.missingOffset != -1) { + entry.missingBytes = meta.readLong(); + } else { + entry.missingBytes = 0; + } + entry.minLength = meta.readVInt(); + entry.maxLength = meta.readVInt(); + if (entry.minLength != entry.maxLength) { + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); + } + binaries.put(fieldNumber, entry); + } else if (fieldType == FST) { + FSTEntry entry = new FSTEntry(); + entry.offset = meta.readLong(); + entry.numOrds = meta.readVLong(); + fsts.put(fieldNumber, entry); + } else { + throw new CorruptIndexException("invalid entry type: " + fieldType + ", input=" + meta); + } + fieldNumber = meta.readVInt(); + } + } + + @Override + public synchronized NumericDocValues getNumeric(FieldInfo field) throws IOException { + NumericDocValues instance = numericInstances.get(field.number); + if (instance == null) { + instance = loadNumeric(field); + numericInstances.put(field.number, instance); + } + return instance; + } + + private NumericDocValues loadNumeric(FieldInfo field) throws IOException { + NumericEntry entry = numerics.get(field.number); + data.seek(entry.offset + entry.missingBytes); + switch (entry.format) { + case TABLE_COMPRESSED: + int size = data.readVInt(); + if (size > 256) { + throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + data); + } + final long decode[] = new long[size]; + for (int i = 0; i < decode.length; i++) { + decode[i] = data.readLong(); + } + final int formatID = data.readVInt(); + final int bitsPerValue = 
data.readVInt(); + final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), entry.packedIntsVersion, maxDoc, bitsPerValue); + return new NumericDocValues() { + @Override + public long get(int docID) { + return decode[(int)ordsReader.get(docID)]; + } + }; + case DELTA_COMPRESSED: + final int blockSize = data.readVInt(); + final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false); + return new NumericDocValues() { + @Override + public long get(int docID) { + return reader.get(docID); + } + }; + case UNCOMPRESSED: + final byte bytes[] = new byte[maxDoc]; + data.readBytes(bytes, 0, bytes.length); + return new NumericDocValues() { + @Override + public long get(int docID) { + return bytes[docID]; + } + }; + case GCD_COMPRESSED: + final long min = data.readLong(); + final long mult = data.readLong(); + final int quotientBlockSize = data.readVInt(); + final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, quotientBlockSize, maxDoc, false); + return new NumericDocValues() { + @Override + public long get(int docID) { + return min + mult * quotientReader.get(docID); + } + }; + default: + throw new AssertionError(); + } + } + + @Override + public synchronized BinaryDocValues getBinary(FieldInfo field) throws IOException { + BinaryDocValues instance = binaryInstances.get(field.number); + if (instance == null) { + instance = loadBinary(field); + binaryInstances.put(field.number, instance); + } + return instance; + } + + private BinaryDocValues loadBinary(FieldInfo field) throws IOException { + BinaryEntry entry = binaries.get(field.number); + data.seek(entry.offset); + PagedBytes bytes = new PagedBytes(16); + bytes.copy(data, entry.numBytes); + final PagedBytes.Reader bytesReader = bytes.freeze(true); + if (entry.minLength == entry.maxLength) { + final int fixedLength = entry.minLength; + return new BinaryDocValues() { + @Override + public void get(int docID, BytesRef result) { + bytesReader.fillSlice(result, fixedLength * (long)docID, fixedLength); + } + }; + } else { + data.seek(data.getFilePointer() + entry.missingBytes); + final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false); + return new BinaryDocValues() { + @Override + public void get(int docID, BytesRef result) { + long startAddress = docID == 0 ? 
0 : addresses.get(docID-1); + long endAddress = addresses.get(docID); + bytesReader.fillSlice(result, startAddress, (int) (endAddress - startAddress)); + } + }; + } + } + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + final FSTEntry entry = fsts.get(field.number); + if (entry.numOrds == 0) { + return SortedDocValues.EMPTY; + } + FST instance; + synchronized(this) { + instance = fstInstances.get(field.number); + if (instance == null) { + data.seek(entry.offset); + instance = new FST(data, PositiveIntOutputs.getSingleton()); + fstInstances.put(field.number, instance); + } + } + final NumericDocValues docToOrd = getNumeric(field); + final FST fst = instance; + + // per-thread resources + final BytesReader in = fst.getBytesReader(); + final Arc firstArc = new Arc(); + final Arc scratchArc = new Arc(); + final IntsRef scratchInts = new IntsRef(); + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); + + return new SortedDocValues() { + @Override + public int getOrd(int docID) { + return (int) docToOrd.get(docID); + } + + @Override + public void lookupOrd(int ord, BytesRef result) { + try { + in.setPosition(0); + fst.getFirstArc(firstArc); + IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts); + result.bytes = new byte[output.length]; + result.offset = 0; + result.length = 0; + Util.toBytesRef(output, result); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + @Override + public int lookupTerm(BytesRef key) { + try { + InputOutput o = fstEnum.seekCeil(key); + if (o == null) { + return -getValueCount()-1; + } else if (o.input.equals(key)) { + return o.output.intValue(); + } else { + return (int) -o.output-1; + } + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + @Override + public int getValueCount() { + return (int)entry.numOrds; + } + + @Override + public TermsEnum termsEnum() { + return new FSTTermsEnum(fst); + } + }; + } + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + final FSTEntry entry = fsts.get(field.number); + if (entry.numOrds == 0) { + return SortedSetDocValues.EMPTY; // empty FST! 
+ } + FST instance; + synchronized(this) { + instance = fstInstances.get(field.number); + if (instance == null) { + data.seek(entry.offset); + instance = new FST(data, PositiveIntOutputs.getSingleton()); + fstInstances.put(field.number, instance); + } + } + final BinaryDocValues docToOrds = getBinary(field); + final FST fst = instance; + + // per-thread resources + final BytesReader in = fst.getBytesReader(); + final Arc firstArc = new Arc(); + final Arc scratchArc = new Arc(); + final IntsRef scratchInts = new IntsRef(); + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); + final BytesRef ref = new BytesRef(); + final ByteArrayDataInput input = new ByteArrayDataInput(); + return new SortedSetDocValues() { + long currentOrd; + + @Override + public long nextOrd() { + if (input.eof()) { + return NO_MORE_ORDS; + } else { + currentOrd += input.readVLong(); + return currentOrd; + } + } + + @Override + public void setDocument(int docID) { + docToOrds.get(docID, ref); + input.reset(ref.bytes, ref.offset, ref.length); + currentOrd = 0; + } + + @Override + public void lookupOrd(long ord, BytesRef result) { + try { + in.setPosition(0); + fst.getFirstArc(firstArc); + IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts); + result.bytes = new byte[output.length]; + result.offset = 0; + result.length = 0; + Util.toBytesRef(output, result); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + @Override + public long lookupTerm(BytesRef key) { + try { + InputOutput o = fstEnum.seekCeil(key); + if (o == null) { + return -getValueCount()-1; + } else if (o.input.equals(key)) { + return o.output.intValue(); + } else { + return -o.output-1; + } + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + @Override + public long getValueCount() { + return entry.numOrds; + } + + @Override + public TermsEnum termsEnum() { + return new FSTTermsEnum(fst); + } + }; + } + + private Bits getMissingBits(int fieldNumber, final long offset, final long length) throws IOException { + if (offset == -1) { + return new Bits.MatchAllBits(maxDoc); + } else { + Bits instance; + synchronized(this) { + instance = docsWithFieldInstances.get(fieldNumber); + if (instance == null) { + IndexInput data = this.data.clone(); + data.seek(offset); + assert length % 8 == 0; + long bits[] = new long[(int) length >> 3]; + for (int i = 0; i < bits.length; i++) { + bits[i] = data.readLong(); + } + instance = new FixedBitSet(bits, maxDoc); + docsWithFieldInstances.put(fieldNumber, instance); + } + } + return instance; + } + } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + switch(field.getDocValuesType()) { + case SORTED_SET: + return new SortedSetDocsWithField(getSortedSet(field), maxDoc); + case SORTED: + return new SortedDocsWithField(getSorted(field), maxDoc); + case BINARY: + BinaryEntry be = binaries.get(field.number); + return getMissingBits(field.number, be.missingOffset, be.missingBytes); + case NUMERIC: + NumericEntry ne = numerics.get(field.number); + return getMissingBits(field.number, ne.missingOffset, ne.missingBytes); + default: + throw new AssertionError(); + } + } + + @Override + public void close() throws IOException { + data.close(); + } + + static class NumericEntry { + long offset; + long missingOffset; + long missingBytes; + byte format; + int packedIntsVersion; + } + + static class BinaryEntry { + long offset; + long missingOffset; + long missingBytes; + long numBytes; + int minLength; + int maxLength; 
+ int packedIntsVersion; + int blockSize; + } + + static class FSTEntry { + long offset; + long numOrds; + } + + // exposes FSTEnum directly as a TermsEnum: avoids binary-search next() + static class FSTTermsEnum extends TermsEnum { + final BytesRefFSTEnum in; + + // this is all for the complicated seek(ord)... + // maybe we should add a FSTEnum that supports this operation? + final FST fst; + final FST.BytesReader bytesReader; + final Arc firstArc = new Arc(); + final Arc scratchArc = new Arc(); + final IntsRef scratchInts = new IntsRef(); + final BytesRef scratchBytes = new BytesRef(); + + FSTTermsEnum(FST fst) { + this.fst = fst; + in = new BytesRefFSTEnum(fst); + bytesReader = fst.getBytesReader(); + } + + @Override + public BytesRef next() throws IOException { + InputOutput io = in.next(); + if (io == null) { + return null; + } else { + return io.input; + } + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + if (in.seekCeil(text) == null) { + return SeekStatus.END; + } else if (term().equals(text)) { + // TODO: add SeekStatus to FSTEnum like in https://issues.apache.org/jira/browse/LUCENE-3729 + // to remove this comparision? + return SeekStatus.FOUND; + } else { + return SeekStatus.NOT_FOUND; + } + } + + @Override + public boolean seekExact(BytesRef text) throws IOException { + if (in.seekExact(text) == null) { + return false; + } else { + return true; + } + } + + @Override + public void seekExact(long ord) throws IOException { + // TODO: would be better to make this simpler and faster. + // but we dont want to introduce a bug that corrupts our enum state! + bytesReader.setPosition(0); + fst.getFirstArc(firstArc); + IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts); + scratchBytes.bytes = new byte[output.length]; + scratchBytes.offset = 0; + scratchBytes.length = 0; + Util.toBytesRef(output, scratchBytes); + // TODO: we could do this lazily, better to try to push into FSTEnum though? + in.seekExact(scratchBytes); + } + + @Override + public BytesRef term() throws IOException { + return in.current().input; + } + + @Override + public long ord() throws IOException { + return in.current().output; + } + + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/package.html b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/package.html index 340e8316908..468cc07257e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/package.html +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/package.html @@ -20,6 +20,6 @@ -Postings format that is read entirely into memory. +Postings and DocValues formats that are read entirely into memory. 
\ No newline at end of file
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java
index 02557c95b57..c256367f23f 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java
@@ -38,12 +38,16 @@ import org.apache.lucene.index.SegmentWriteState;
  *    minvalue 0
  *    pattern 000
  *  005
+ *  T
  *  234
+ *  T
  *  123
+ *  T
  *  ...
  *
  * so a document's value (delta encoded from minvalue) can be retrieved by
- * seeking to startOffset + (1+pattern.length())*docid. The extra 1 is the newline.
+ * seeking to startOffset + (1+pattern.length()+2)*docid. The extra 1 is the newline.
+ * The extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
  *
  * for bytes this is also a "fixed-width" file, for example:
  *
@@ -53,12 +57,15 @@ import org.apache.lucene.index.SegmentWriteState;
  *    pattern 0
  *  length 6
  *  foobar[space][space]
+ *  T
  *  length 3
  *  baz[space][space][space][space][space]
+ *  T
  *  ...
  *  
- * so a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*doc
+ * so a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength+2)*doc
  * the extra 9 is 2 newlines, plus "length " itself.
+ * the extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
  *
  * for sorted bytes this is a fixed-width file, for example:
  *
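The two seek formulas above are easy to misread, so here is a hedged sketch of the arithmetic (method and parameter names are illustrative placeholders, not the reader's actual fields):

    // Per-document record offsets in the SimpleText fixed-width layout,
    // including the new "T"/"F" missing-marker line.
    long numericOffset(long dataStart, int patternLength, int docID) {
      // 1 = the value's trailing newline; 2 = the 'T'/'F' byte plus its newline
      return dataStart + (1 + patternLength + 2) * (long) docID;
    }
    long binaryOffset(long dataStart, int patternLength, int maxLength, int docID) {
      // 9 = two newlines plus "length " itself; 2 = the 'T'/'F' byte plus its newline
      return dataStart + (9 + patternLength + maxLength + 2) * (long) docID;
    }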
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
index db5ec4e09f3..3753a62193c 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
@@ -38,6 +38,7 @@ import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.FieldInfo.DocValuesType;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.StringHelper;
 
@@ -100,7 +101,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
         assert startsWith(PATTERN);
         field.pattern = stripPrefix(PATTERN);
         field.dataStartFilePointer = data.getFilePointer();
-        data.seek(data.getFilePointer() + (1+field.pattern.length()) * maxDoc);
+        data.seek(data.getFilePointer() + (1+field.pattern.length()+2) * maxDoc);
       } else if (dvType == DocValuesType.BINARY) {
         readLine();
         assert startsWith(MAXLENGTH);
@@ -109,7 +110,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
         assert startsWith(PATTERN);
         field.pattern = stripPrefix(PATTERN);
         field.dataStartFilePointer = data.getFilePointer();
-        data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength) * maxDoc);
+        data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength+2) * maxDoc);
       } else if (dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) {
         readLine();
         assert startsWith(NUMVALUES);
@@ -158,7 +159,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
           if (docID < 0 || docID >= maxDoc) {
             throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
           }
-          in.seek(field.dataStartFilePointer + (1+field.pattern.length())*docID);
+          in.seek(field.dataStartFilePointer + (1+field.pattern.length()+2)*docID);
           SimpleTextUtil.readLine(in, scratch);
           //System.out.println("parsing delta: " + scratch.utf8ToString());
           BigDecimal bd;
@@ -169,6 +170,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
             e.initCause(pe);
             throw e;
           }
+          SimpleTextUtil.readLine(in, scratch); // read the line telling us if it's a real value or missing ('T' or 'F')
           return BigInteger.valueOf(field.minValue).add(bd.toBigIntegerExact()).longValue();
         } catch (IOException ioe) {
           throw new RuntimeException(ioe);
@@ -176,6 +178,30 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
       }
     };
   }
+  
+  private Bits getNumericDocsWithField(FieldInfo fieldInfo) throws IOException {
+    final OneField field = fields.get(fieldInfo.name);
+    final IndexInput in = data.clone();
+    final BytesRef scratch = new BytesRef();
+    return new Bits() {
+      @Override
+      public boolean get(int index) {
+        try {
+          in.seek(field.dataStartFilePointer + (1+field.pattern.length()+2)*index);
+          SimpleTextUtil.readLine(in, scratch); // data
+          SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
+          return scratch.bytes[scratch.offset] == (byte) 'T';
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+
+      @Override
+      public int length() {
+        return maxDoc;
+      }
+    };
+  }
 
   @Override
   public BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOException {
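A hedged caller-side sketch of what the new per-field Bits enables (producer, fieldInfo and maxDoc are assumed to be in scope; illustrative only, not shipped code):

    // Distinguish a real 0 from "no value at all" for a NUMERIC field.
    NumericDocValues values = producer.getNumeric(fieldInfo);
    Bits docsWithField = producer.getDocsWithField(fieldInfo);
    for (int doc = 0; doc < maxDoc; doc++) {
      if (docsWithField.get(doc)) {
        long v = values.get(doc);  // present (may legitimately be 0)
      } else {
        // missing: values.get(doc) would only return the 0 placeholder
      }
    }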
@@ -196,7 +222,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
           if (docID < 0 || docID >= maxDoc) {
             throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
           }
-          in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength)*docID);
+          in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength+2)*docID);
           SimpleTextUtil.readLine(in, scratch);
           assert StringHelper.startsWith(scratch, LENGTH);
           int len;
@@ -217,6 +243,45 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
       }
     };
   }
+  
+  private Bits getBinaryDocsWithField(FieldInfo fieldInfo) throws IOException {
+    final OneField field = fields.get(fieldInfo.name);
+    final IndexInput in = data.clone();
+    final BytesRef scratch = new BytesRef();
+    final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
+
+    return new Bits() {
+      @Override
+      public boolean get(int index) {
+        try {
+          in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength+2)*index);
+          SimpleTextUtil.readLine(in, scratch);
+          assert StringHelper.startsWith(scratch, LENGTH);
+          int len;
+          try {
+            len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
+          } catch (ParseException pe) {
+            CorruptIndexException e = new CorruptIndexException("failed to parse int length (resource=" + in + ")");
+            e.initCause(pe);
+            throw e;
+          }
+          // skip past bytes
+          byte bytes[] = new byte[len];
+          in.readBytes(bytes, 0, len);
+          SimpleTextUtil.readLine(in, scratch); // rest of the line: padding + newline
+          SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
+          return scratch.bytes[scratch.offset] == (byte) 'T';
+        } catch (IOException ioe) {
+          throw new RuntimeException(ioe);
+        }
+      }
+
+      @Override
+      public int length() {
+        return maxDoc;
+      }
+    };
+  }
 
   @Override
   public SortedDocValues getSorted(FieldInfo fieldInfo) throws IOException {
@@ -241,7 +306,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
           in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + docID * (1 + field.ordPattern.length()));
           SimpleTextUtil.readLine(in, scratch);
           try {
-            return ordDecoder.parse(scratch.utf8ToString()).intValue();
+            return (int) ordDecoder.parse(scratch.utf8ToString()).longValue()-1;
           } catch (ParseException pe) {
             CorruptIndexException e = new CorruptIndexException("failed to parse ord (resource=" + in + ")");
             e.initCause(pe);
@@ -362,6 +427,22 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
       }
     };
   }
+  
+  @Override
+  public Bits getDocsWithField(FieldInfo field) throws IOException {
+    switch (field.getDocValuesType()) {
+      case SORTED_SET:
+        return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
+      case SORTED:
+        return new SortedDocsWithField(getSorted(field), maxDoc);
+      case BINARY:
+        return getBinaryDocsWithField(field);
+      case NUMERIC:
+        return getNumericDocsWithField(field);
+      default:
+        throw new AssertionError();
+    }
+  }
 
   @Override
   public void close() throws IOException {
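The getOrd() change above pairs with the writer change further down: ords are stored shifted by one so the all-digits pattern can also represent a missing document. A hedged sketch of the convention:

    // On disk: storedOrd = ord + 1, so ord -1 (missing) becomes 0 and still
    // fits a non-negative DecimalFormat pattern. The reader undoes the shift.
    long storedOrd(long ord) { return ord + 1; }            // writer side
    int readOrd(long stored) { return (int) stored - 1; }   // reader side; -1 == missing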
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
index 2f86255cbd6..e5f1e35007e 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
@@ -78,7 +78,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
     long minValue = Long.MAX_VALUE;
     long maxValue = Long.MIN_VALUE;
     for(Number n : values) {
-      long v = n.longValue();
+      long v = n == null ? 0 : n.longValue();
       minValue = Math.min(minValue, v);
       maxValue = Math.max(maxValue, v);
     }
@@ -112,13 +112,19 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
 
     // second pass to write the values
     for(Number n : values) {
-      long value = n.longValue();
+      long value = n == null ? 0 : n.longValue();
       assert value >= minValue;
       Number delta = BigInteger.valueOf(value).subtract(BigInteger.valueOf(minValue));
       String s = encoder.format(delta);
       assert s.length() == patternString.length();
       SimpleTextUtil.write(data, s, scratch);
       SimpleTextUtil.writeNewline(data);
+      if (n == null) {
+        SimpleTextUtil.write(data, "F", scratch);
+      } else {
+        SimpleTextUtil.write(data, "T", scratch);
+      }
+      SimpleTextUtil.writeNewline(data);
       numDocsWritten++;
       assert numDocsWritten <= numDocs;
     }
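Each numeric document record is now two lines instead of one. An illustrative excerpt (assuming minvalue 0 and pattern "000"; the first doc has value 7, the second doc has no value):

    007
    T
    000
    F

The "000" placeholder keeps the file fixed-width, and the 'F' line is what getNumericDocsWithField reads back.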
@@ -132,7 +138,8 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
     assert field.getDocValuesType() == DocValuesType.BINARY;
     int maxLength = 0;
     for(BytesRef value : values) {
-      maxLength = Math.max(maxLength, value.length);
+      final int length = value == null ? 0 : value.length;
+      maxLength = Math.max(maxLength, length);
     }
     writeFieldEntry(field, FieldInfo.DocValuesType.BINARY);
 
@@ -155,19 +162,28 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
     int numDocsWritten = 0;
     for(BytesRef value : values) {
       // write length
+      final int length = value == null ? 0 : value.length;
       SimpleTextUtil.write(data, LENGTH);
-      SimpleTextUtil.write(data, encoder.format(value.length), scratch);
+      SimpleTextUtil.write(data, encoder.format(length), scratch);
       SimpleTextUtil.writeNewline(data);
         
       // write bytes -- don't use SimpleText.write
       // because it escapes:
-      data.writeBytes(value.bytes, value.offset, value.length);
+      if (value != null) {
+        data.writeBytes(value.bytes, value.offset, value.length);
+      }
 
       // pad to fit
-      for (int i = value.length; i < maxLength; i++) {
+      for (int i = length; i < maxLength; i++) {
         data.writeByte((byte)' ');
       }
       SimpleTextUtil.writeNewline(data);
+      if (value == null) {
+        SimpleTextUtil.write(data, "F", scratch);
+      } else {
+        SimpleTextUtil.write(data, "T", scratch);
+      }
+      SimpleTextUtil.writeNewline(data);
       numDocsWritten++;
     }
 
@@ -209,7 +225,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
     SimpleTextUtil.writeNewline(data);
     final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
     
-    int maxOrdBytes = Integer.toString(valueCount).length();
+    int maxOrdBytes = Long.toString(valueCount+1L).length(); // stored ords are shifted by +1, see below
     sb.setLength(0);
     for (int i = 0; i < maxOrdBytes; i++) {
       sb.append('0');
@@ -246,7 +262,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
     assert valuesSeen == valueCount;
 
     for(Number ord : docToOrd) {
-      SimpleTextUtil.write(data, ordEncoder.format(ord.intValue()), scratch);
+      SimpleTextUtil.write(data, ordEncoder.format(ord.longValue()+1), scratch);
       SimpleTextUtil.writeNewline(data);
     }
   }
diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
index e289c4d0966..5103c529355 100644
--- a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
+++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
@@ -14,4 +14,5 @@
 #  limitations under the License.
 
 org.apache.lucene.codecs.diskdv.DiskDocValuesFormat
+org.apache.lucene.codecs.memory.MemoryDocValuesFormat
 org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat
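With the SPI entry above in place, the new format becomes resolvable by name; a hedged usage sketch (the format names are assumed from the formats' constructors):

    // Both lookups are resolved through the META-INF/services registration.
    DocValuesFormat memory = DocValuesFormat.forName("Memory");
    DocValuesFormat disk = DocValuesFormat.forName("Disk");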
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestMemoryDocValuesFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestMemoryDocValuesFormat.java
new file mode 100644
index 00000000000..77c6ea582a8
--- /dev/null
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestMemoryDocValuesFormat.java
@@ -0,0 +1,39 @@
+package org.apache.lucene.codecs.memory;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Tests MemoryDocValuesFormat
+ */
+public class TestMemoryDocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
+  private final Codec codec = _TestUtil.alwaysDocValuesFormat(new MemoryDocValuesFormat());
+
+  @Override
+  protected Codec getCodec() {
+    return codec;
+  }
+
+  @Override
+  protected boolean codecAcceptsHugeBinaryValues(String field) {
+    return false;
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
index 1b2726f06fd..3b98449eb5c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
@@ -119,7 +119,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI {
     loader.reload(classloader);
   }
   
-  private static Codec defaultCodec = Codec.forName("Lucene42");
+  private static Codec defaultCodec = Codec.forName("Lucene45");
   
   /** expert: returns the default codec used for newly created
    *  {@link IndexWriterConfig}s.
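A hedged sketch of how the new default interacts with explicit configuration (analyzer is a placeholder):

    // New writers now default to Lucene45; setting it explicitly is equivalent.
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_45, analyzer);
    iwc.setCodec(Codec.forName("Lucene45"));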
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
index 921b94dff69..e0aac6d5cbe 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
@@ -69,7 +69,8 @@ public abstract class DocValuesConsumer implements Closeable {
   /**
    * Writes numeric docvalues for a field.
    * @param field field information
-   * @param values Iterable of numeric values (one for each document).
+   * @param values Iterable of numeric values (one for each document). {@code null} indicates
+   *               a missing value.
    * @throws IOException if an I/O error occurred.
    */
   public abstract void addNumericField(FieldInfo field, Iterable values) throws IOException;    
@@ -77,7 +78,8 @@ public abstract class DocValuesConsumer implements Closeable {
   /**
    * Writes binary docvalues for a field.
    * @param field field information
-   * @param values Iterable of binary values (one for each document).
+   * @param values Iterable of binary values (one for each document). {@code null} indicates
+   *               a missing value.
    * @throws IOException if an I/O error occurred.
    */
   public abstract void addBinaryField(FieldInfo field, Iterable values) throws IOException;
@@ -86,7 +88,8 @@ public abstract class DocValuesConsumer implements Closeable {
    * Writes pre-sorted binary docvalues for a field.
    * @param field field information
    * @param values Iterable of binary values in sorted order (deduplicated).
-   * @param docToOrd Iterable of ordinals (one for each document).
+   * @param docToOrd Iterable of ordinals (one for each document). {@code -1} indicates
+   *                 a missing value.
    * @throws IOException if an I/O error occurred.
    */
   public abstract void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException;
@@ -95,7 +98,8 @@ public abstract class DocValuesConsumer implements Closeable {
    * Writes pre-sorted set docvalues for a field
    * @param field field information
    * @param values Iterable of binary values in sorted order (deduplicated).
-   * @param docToOrdCount Iterable of the number of values for each document. 
+   * @param docToOrdCount Iterable of the number of values for each document. A zero ordinal
+   *                      count indicates a missing value.
    * @param ords Iterable of ordinal occurrences (docToOrdCount*maxDoc total).
    * @throws IOException if an I/O error occurred.
    */
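Taken together, the contracts above mean a consumer must treat null (numeric and binary), -1 (sorted ord) and a zero ord count (sorted set) as "document has no value". An illustrative-only consumer fragment:

    // Minimal null-aware numeric consumer loop (sketch, not shipped code).
    public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
      for (Number n : values) {
        if (n == null) {
          // document has no value: record it in the missing-docs structure
        } else {
          long v = n.longValue(); // encode v as before
        }
      }
    }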
@@ -107,7 +111,7 @@ public abstract class DocValuesConsumer implements Closeable {
    * The default implementation calls {@link #addNumericField}, passing
    * an Iterable that merges and filters deleted documents on the fly.
    */
-  public void mergeNumericField(FieldInfo fieldInfo, final MergeState mergeState, final List toMerge) throws IOException {
+  public void mergeNumericField(final FieldInfo fieldInfo, final MergeState mergeState, final List toMerge, final List docsWithField) throws IOException {
 
     addNumericField(fieldInfo,
                     new Iterable() {
@@ -116,10 +120,11 @@ public abstract class DocValuesConsumer implements Closeable {
                         return new Iterator() {
                           int readerUpto = -1;
                           int docIDUpto;
-                          long nextValue;
+                          Long nextValue;
                           AtomicReader currentReader;
                           NumericDocValues currentValues;
                           Bits currentLiveDocs;
+                          Bits currentDocsWithField;
                           boolean nextIsSet;
 
                           @Override
@@ -139,7 +144,6 @@ public abstract class DocValuesConsumer implements Closeable {
                             }
                             assert nextIsSet;
                             nextIsSet = false;
-                            // TODO: make a mutable number
                             return nextValue;
                           }
 
@@ -155,6 +159,7 @@ public abstract class DocValuesConsumer implements Closeable {
                                   currentReader = mergeState.readers.get(readerUpto);
                                   currentValues = toMerge.get(readerUpto);
                                   currentLiveDocs = currentReader.getLiveDocs();
+                                  currentDocsWithField = docsWithField.get(readerUpto);
                                 }
                                 docIDUpto = 0;
                                 continue;
@@ -162,7 +167,11 @@ public abstract class DocValuesConsumer implements Closeable {
 
                               if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
                                 nextIsSet = true;
-                                nextValue = currentValues.get(docIDUpto);
+                                if (currentDocsWithField.get(docIDUpto)) {
+                                  nextValue = currentValues.get(docIDUpto);
+                                } else {
+                                  nextValue = null;
+                                }
                                 docIDUpto++;
                                 return true;
                               }
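The rule the merge iterator now follows, restated as a hedged fragment (for a doc already known to be live; deleted docs are never emitted at all):

    Long next = currentDocsWithField.get(docIDUpto)
        ? Long.valueOf(currentValues.get(docIDUpto))  // present, possibly 0
        : null;                                       // missing survives the merge as null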
@@ -181,7 +190,7 @@ public abstract class DocValuesConsumer implements Closeable {
    * The default implementation calls {@link #addBinaryField}, passing
    * an Iterable that merges and filters deleted documents on the fly.
    */
-  public void mergeBinaryField(FieldInfo fieldInfo, final MergeState mergeState, final List toMerge) throws IOException {
+  public void mergeBinaryField(FieldInfo fieldInfo, final MergeState mergeState, final List toMerge, final List docsWithField) throws IOException {
 
     addBinaryField(fieldInfo,
                    new Iterable() {
@@ -191,9 +200,11 @@ public abstract class DocValuesConsumer implements Closeable {
                          int readerUpto = -1;
                          int docIDUpto;
                          BytesRef nextValue = new BytesRef();
+                         BytesRef nextPointer; // null if the current doc's value is missing, otherwise aliases nextValue
                          AtomicReader currentReader;
                          BinaryDocValues currentValues;
                          Bits currentLiveDocs;
+                         Bits currentDocsWithField;
                          boolean nextIsSet;
 
                          @Override
@@ -213,8 +224,7 @@ public abstract class DocValuesConsumer implements Closeable {
                            }
                            assert nextIsSet;
                            nextIsSet = false;
-                           // TODO: make a mutable number
-                           return nextValue;
+                           return nextPointer;
                          }
 
                          private boolean setNext() {
@@ -228,6 +238,7 @@ public abstract class DocValuesConsumer implements Closeable {
                                if (readerUpto < toMerge.size()) {
                                  currentReader = mergeState.readers.get(readerUpto);
                                  currentValues = toMerge.get(readerUpto);
+                                 currentDocsWithField = docsWithField.get(readerUpto);
                                  currentLiveDocs = currentReader.getLiveDocs();
                                }
                                docIDUpto = 0;
@@ -236,7 +247,12 @@ public abstract class DocValuesConsumer implements Closeable {
 
                              if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
                                nextIsSet = true;
-                               currentValues.get(docIDUpto, nextValue);
+                               if (currentDocsWithField.get(docIDUpto)) {
+                                 currentValues.get(docIDUpto, nextValue);
+                                 nextPointer = nextValue;
+                               } else {
+                                 nextPointer = null;
+                               }
                                docIDUpto++;
                                return true;
                              }
@@ -272,7 +288,10 @@ public abstract class DocValuesConsumer implements Closeable {
         OpenBitSet bitset = new OpenBitSet(dv.getValueCount());
         for (int i = 0; i < reader.maxDoc(); i++) {
           if (liveDocs.get(i)) {
-            bitset.set(dv.getOrd(i));
+            int ord = dv.getOrd(i);
+            if (ord >= 0) {
+              bitset.set(ord);
+            }
           }
         }
         liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
@@ -368,7 +387,7 @@ public abstract class DocValuesConsumer implements Closeable {
                   if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
                     nextIsSet = true;
                     int segOrd = dvs[readerUpto].getOrd(docIDUpto);
-                    nextValue = (int) map.getGlobalOrd(readerUpto, segOrd);
+                    nextValue = segOrd == -1 ? -1 : (int) map.getGlobalOrd(readerUpto, segOrd);
                     docIDUpto++;
                     return true;
                   }
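Both guards above exist because -1 can now come from any SortedDocValues, not just FieldCache uninversion: a missing segment ord must stay -1 instead of being mapped to a global ord, and the live-terms bitset must skip it. A hedged restatement:

    int segOrd = dv.getOrd(doc);
    if (segOrd >= 0) {
      bitset.set(segOrd); // only real ords feed the live-terms filter; set(-1) would be out of bounds
    }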
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
index b2c5d549d27..05dfcf1b0dd 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
@@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.util.Bits;
 
 /** Abstract API that produces numeric, binary and
  * sorted docvalues.
@@ -56,4 +57,67 @@ public abstract class DocValuesProducer implements Closeable {
    *  The returned instance need not be thread-safe: it will only be
    *  used by a single thread. */
   public abstract SortedSetDocValues getSortedSet(FieldInfo field) throws IOException;
+  
+  /** Returns a {@link Bits} of size reader.maxDoc(),
+   *  with a bit set for each docid that has a value for this field.
+   *  The returned instance need not be thread-safe: it will only be
+   *  used by a single thread. */
+  public abstract Bits getDocsWithField(FieldInfo field) throws IOException;
+  
+  /** 
+   * A simple implementation of {@link DocValuesProducer#getDocsWithField} that 
+   * returns {@code true} if a document has an ordinal &gt;= 0.
+   * <p>
+   * Codecs can choose to use this (or implement it more efficiently another way), but
+   * in most cases a Bits is unnecessary anyway: users can check this as they go.
+   */
+  public static class SortedDocsWithField implements Bits {
+    final SortedDocValues in;
+    final int maxDoc;
+
+    /** Creates a {@link Bits} returning true if the document has a value */
+    public SortedDocsWithField(SortedDocValues in, int maxDoc) {
+      this.in = in;
+      this.maxDoc = maxDoc;
+    }
+
+    @Override
+    public boolean get(int index) {
+      return in.getOrd(index) >= 0;
+    }
+
+    @Override
+    public int length() {
+      return maxDoc;
+    }
+  }
+
+  /**
+   * A simple implementation of {@link DocValuesProducer#getDocsWithField} that
+   * returns {@code true} if a document has any ordinals.
+   * <p>
+   * Codecs can choose to use this (or implement it more efficiently another way), but
+   * in most cases a Bits is unnecessary anyway: users can check this as they go.
+   */
+  public static class SortedSetDocsWithField implements Bits {
+    final SortedSetDocValues in;
+    final int maxDoc;
+
+    /** Creates a {@link Bits} returning true if the document has a value */
+    public SortedSetDocsWithField(SortedSetDocValues in, int maxDoc) {
+      this.in = in;
+      this.maxDoc = maxDoc;
+    }
+
+    @Override
+    public boolean get(int index) {
+      in.setDocument(index);
+      return in.nextOrd() != SortedSetDocValues.NO_MORE_ORDS;
+    }
+
+    @Override
+    public int length() {
+      return maxDoc;
+    }
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java b/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java
index fc4728ae846..d97d577de22 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java
@@ -21,13 +21,13 @@ package org.apache.lucene.codecs;
  * A codec that forwards all its method calls to another codec.
  * <p>
  * Extend this class when you need to reuse the functionality of an existing
- * codec. For example, if you want to build a codec that redefines Lucene42's
+ * codec. For example, if you want to build a codec that redefines Lucene45's
  * {@link LiveDocsFormat}:
  * <pre class="prettyprint">
  *   public final class CustomCodec extends FilterCodec {
  *
  *     public CustomCodec() {
- *       super("CustomCodec", new Lucene42Codec());
+ *       super("CustomCodec", new Lucene45Codec());
  *     }
  *
  *     public LiveDocsFormat liveDocsFormat() {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java
index c7911f47dac..0e7ac44d02c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java
@@ -27,7 +27,6 @@ import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.NormsFormat;
 import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.TermVectorsFormat;
-import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 
 /**
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java
index 21a082c1893..54617702c07 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java
@@ -35,6 +35,7 @@ import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.store.CompoundFileDirectory;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.PagedBytes;
@@ -620,6 +621,11 @@ final class Lucene40DocValuesReader extends DocValuesProducer {
     throw new IllegalStateException("Lucene 4.0 does not support SortedSet: how did you pull this off?");
   }
 
+  @Override
+  public Bits getDocsWithField(FieldInfo field) throws IOException {
+    return new Bits.MatchAllBits(state.segmentInfo.getDocCount());
+  }
+
   @Override
   public void close() throws IOException {
     dir.close();
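For 4.0 segments there is nothing to compute: the format predates missing-value tracking, so every document is reported as having a value. A hedged sketch of the semantics:

    Bits docsWith = new Bits.MatchAllBits(maxDoc);
    // docsWith.get(doc) is true for every doc in [0, maxDoc)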
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java
index 4ec3bd833a4..0ce97ba76e6 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java
@@ -17,7 +17,10 @@ package org.apache.lucene.codecs.lucene42;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.FieldInfosFormat;
 import org.apache.lucene.codecs.FilterCodec;
@@ -32,6 +35,7 @@ import org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat;
 import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+import org.apache.lucene.index.SegmentWriteState;
 
 /**
  * Implements the Lucene 4.2 index format, with configurable per-field postings
@@ -42,10 +46,12 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
  *
  * @see org.apache.lucene.codecs.lucene42 package documentation for file format details.
  * @lucene.experimental
+ * @deprecated Only for reading old 4.2 segments
  */
 // NOTE: if we make largish changes in a minor release, easier to just make Lucene43Codec or whatever
 // if they are backwards compatible or smallish we can probably do the backwards in the postingsreader
 // (it writes a minor version, etc).
+@Deprecated
 public class Lucene42Codec extends Codec {
   private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat();
   private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat();
@@ -129,10 +135,15 @@ public class Lucene42Codec extends Codec {
   private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41");
   private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene42");
 
-  private final NormsFormat normsFormat = new Lucene42NormsFormat();
+  private final NormsFormat normsFormat = new Lucene42NormsFormat() {
+    @Override
+    public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException {
+      throw new UnsupportedOperationException("this codec can only be used for reading");
+    }
+  };
 
   @Override
-  public final NormsFormat normsFormat() {
+  public NormsFormat normsFormat() {
     return normsFormat;
   }
 }
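The overridden normsConsumer makes the 4.2 codec effectively read-only; a hedged sketch of the failure mode (writeState is a placeholder SegmentWriteState):

    NormsFormat norms = new Lucene42Codec().normsFormat();
    try {
      norms.normsConsumer(writeState);
    } catch (UnsupportedOperationException expected) {
      // "this codec can only be used for reading"
    }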
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java
index 55bf8097561..00f18606a49 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java
@@ -123,8 +123,10 @@ import org.apache.lucene.util.packed.BlockPackedWriter;
  * 
    *
  • Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length. *
+ * @deprecated Only for reading old 4.2 segments */ -public final class Lucene42DocValuesFormat extends DocValuesFormat { +@Deprecated +public class Lucene42DocValuesFormat extends DocValuesFormat { /** Maximum length for each binary doc values field. */ public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2; @@ -154,8 +156,7 @@ public final class Lucene42DocValuesFormat extends DocValuesFormat { @Override public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - // note: we choose DEFAULT here (its reasonably fast, and for small bpv has tiny waste) - return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, acceptableOverheadRatio); + throw new UnsupportedOperationException("this codec can only be used for reading"); } @Override @@ -163,8 +164,8 @@ public final class Lucene42DocValuesFormat extends DocValuesFormat { return new Lucene42DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION); } - private static final String DATA_CODEC = "Lucene42DocValuesData"; - private static final String DATA_EXTENSION = "dvd"; - private static final String METADATA_CODEC = "Lucene42DocValuesMetadata"; - private static final String METADATA_EXTENSION = "dvm"; + static final String DATA_CODEC = "Lucene42DocValuesData"; + static final String DATA_EXTENSION = "dvd"; + static final String METADATA_CODEC = "Lucene42DocValuesMetadata"; + static final String METADATA_EXTENSION = "dvm"; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java index 16ecf187b93..7b111c53e5f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java @@ -17,11 +17,6 @@ package org.apache.lucene.codecs.lucene42; * limitations under the License. 
*/ -import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.DELTA_COMPRESSED; -import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.GCD_COMPRESSED; -import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.TABLE_COMPRESSED; -import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.UNCOMPRESSED; - import java.io.IOException; import java.util.Comparator; import java.util.HashMap; @@ -78,6 +73,22 @@ class Lucene42DocValuesProducer extends DocValuesProducer { new HashMap>(); private final int maxDoc; + + + static final byte NUMBER = 0; + static final byte BYTES = 1; + static final byte FST = 2; + + static final int BLOCK_SIZE = 4096; + + static final byte DELTA_COMPRESSED = 0; + static final byte TABLE_COMPRESSED = 1; + static final byte UNCOMPRESSED = 2; + static final byte GCD_COMPRESSED = 3; + + static final int VERSION_START = 0; + static final int VERSION_GCD_COMPRESSION = 1; + static final int VERSION_CURRENT = VERSION_GCD_COMPRESSION; Lucene42DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { maxDoc = state.segmentInfo.getDocCount(); @@ -88,8 +99,8 @@ class Lucene42DocValuesProducer extends DocValuesProducer { final int version; try { version = CodecUtil.checkHeader(in, metaCodec, - Lucene42DocValuesConsumer.VERSION_START, - Lucene42DocValuesConsumer.VERSION_CURRENT); + VERSION_START, + VERSION_CURRENT); numerics = new HashMap(); binaries = new HashMap(); fsts = new HashMap(); @@ -109,8 +120,8 @@ class Lucene42DocValuesProducer extends DocValuesProducer { String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); data = state.directory.openInput(dataName, state.context); final int version2 = CodecUtil.checkHeader(data, dataCodec, - Lucene42DocValuesConsumer.VERSION_START, - Lucene42DocValuesConsumer.VERSION_CURRENT); + VERSION_START, + VERSION_CURRENT); if (version != version2) { throw new CorruptIndexException("Format versions mismatch"); } @@ -127,7 +138,7 @@ class Lucene42DocValuesProducer extends DocValuesProducer { int fieldNumber = meta.readVInt(); while (fieldNumber != -1) { int fieldType = meta.readByte(); - if (fieldType == Lucene42DocValuesConsumer.NUMBER) { + if (fieldType == NUMBER) { NumericEntry entry = new NumericEntry(); entry.offset = meta.readLong(); entry.format = meta.readByte(); @@ -140,11 +151,11 @@ class Lucene42DocValuesProducer extends DocValuesProducer { default: throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); } - if (entry.format != Lucene42DocValuesConsumer.UNCOMPRESSED) { + if (entry.format != UNCOMPRESSED) { entry.packedIntsVersion = meta.readVInt(); } numerics.put(fieldNumber, entry); - } else if (fieldType == Lucene42DocValuesConsumer.BYTES) { + } else if (fieldType == BYTES) { BinaryEntry entry = new BinaryEntry(); entry.offset = meta.readLong(); entry.numBytes = meta.readLong(); @@ -155,7 +166,7 @@ class Lucene42DocValuesProducer extends DocValuesProducer { entry.blockSize = meta.readVInt(); } binaries.put(fieldNumber, entry); - } else if (fieldType == Lucene42DocValuesConsumer.FST) { + } else if (fieldType == FST) { FSTEntry entry = new FSTEntry(); entry.offset = meta.readLong(); entry.numOrds = meta.readVLong(); @@ -429,6 +440,15 @@ class Lucene42DocValuesProducer extends DocValuesProducer { } }; } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + if 
(field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) { + return new SortedSetDocsWithField(getSortedSet(field), maxDoc); + } else { + return new Bits.MatchAllBits(maxDoc); + } + } @Override public void close() throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsConsumer.java new file mode 100644 index 00000000000..797dd807992 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsConsumer.java @@ -0,0 +1,209 @@ +package org.apache.lucene.codecs.lucene42; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.MathUtil; +import org.apache.lucene.util.packed.BlockPackedWriter; +import org.apache.lucene.util.packed.PackedInts.FormatAndBits; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Writer for {@link Lucene42NormsFormat} + */ +class Lucene42NormsConsumer extends DocValuesConsumer { + static final int VERSION_START = 0; + static final int VERSION_GCD_COMPRESSION = 1; + static final int VERSION_CURRENT = VERSION_GCD_COMPRESSION; + + static final byte NUMBER = 0; + + static final int BLOCK_SIZE = 4096; + + static final byte DELTA_COMPRESSED = 0; + static final byte TABLE_COMPRESSED = 1; + static final byte UNCOMPRESSED = 2; + static final byte GCD_COMPRESSED = 3; + + final IndexOutput data, meta; + final int maxDoc; + final float acceptableOverheadRatio; + + Lucene42NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension, float acceptableOverheadRatio) throws IOException { + this.acceptableOverheadRatio = acceptableOverheadRatio; + maxDoc = state.segmentInfo.getDocCount(); + boolean success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.createOutput(dataName, state.context); + CodecUtil.writeHeader(data, dataCodec, VERSION_CURRENT); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + meta = state.directory.createOutput(metaName, state.context); + CodecUtil.writeHeader(meta, metaCodec, VERSION_CURRENT); + success = true; + } finally { + if (!success) { + 
IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void addNumericField(FieldInfo field, Iterable values) throws IOException { + meta.writeVInt(field.number); + meta.writeByte(NUMBER); + meta.writeLong(data.getFilePointer()); + long minValue = Long.MAX_VALUE; + long maxValue = Long.MIN_VALUE; + long gcd = 0; + // TODO: more efficient? + HashSet uniqueValues = null; + if (true) { + uniqueValues = new HashSet<>(); + + long count = 0; + for (Number nv : values) { + assert nv != null; + final long v = nv.longValue(); + + if (gcd != 1) { + if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { + // in that case v - minValue might overflow and make the GCD computation return + // wrong results. Since these extreme values are unlikely, we just discard + // GCD computation for them + gcd = 1; + } else if (count != 0) { // minValue needs to be set first + gcd = MathUtil.gcd(gcd, v - minValue); + } + } + + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + + if (uniqueValues != null) { + if (uniqueValues.add(v)) { + if (uniqueValues.size() > 256) { + uniqueValues = null; + } + } + } + + ++count; + } + assert count == maxDoc; + } + + if (uniqueValues != null) { + // small number of unique values + final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1); + FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio); + if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) { + meta.writeByte(UNCOMPRESSED); // uncompressed + for (Number nv : values) { + data.writeByte(nv == null ? 0 : (byte) nv.longValue()); + } + } else { + meta.writeByte(TABLE_COMPRESSED); // table-compressed + Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]); + final HashMap encode = new HashMap(); + data.writeVInt(decode.length); + for (int i = 0; i < decode.length; i++) { + data.writeLong(decode[i]); + encode.put(decode[i], i); + } + + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeVInt(formatAndBits.format.getId()); + data.writeVInt(formatAndBits.bitsPerValue); + + final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE); + for(Number nv : values) { + writer.add(encode.get(nv == null ? 0 : nv.longValue())); + } + writer.finish(); + } + } else if (gcd != 0 && gcd != 1) { + meta.writeByte(GCD_COMPRESSED); + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeLong(minValue); + data.writeLong(gcd); + data.writeVInt(BLOCK_SIZE); + + final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); + for (Number nv : values) { + long value = nv == null ? 0 : nv.longValue(); + writer.add((value - minValue) / gcd); + } + writer.finish(); + } else { + meta.writeByte(DELTA_COMPRESSED); // delta-compressed + + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeVInt(BLOCK_SIZE); + + final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); + for (Number nv : values) { + writer.add(nv == null ? 
0 : nv.longValue()); + } + writer.finish(); + } + } + + @Override + public void close() throws IOException { + boolean success = false; + try { + if (meta != null) { + meta.writeVInt(-1); // write EOF marker + } + success = true; + } finally { + if (success) { + IOUtils.close(data, meta); + } else { + IOUtils.closeWhileHandlingException(data, meta); + } + } + } + + @Override + public void addBinaryField(FieldInfo field, final Iterable values) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void addSortedSetField(FieldInfo field, Iterable values, final Iterable docToOrdCount, final Iterable ords) throws IOException { + throw new UnsupportedOperationException(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java index a7c8c1a2aa7..a4571ec6237 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java @@ -41,7 +41,7 @@ import org.apache.lucene.util.packed.PackedInts; * * @see Lucene42DocValuesFormat */ -public final class Lucene42NormsFormat extends NormsFormat { +public class Lucene42NormsFormat extends NormsFormat { final float acceptableOverheadRatio; /** @@ -67,7 +67,7 @@ public final class Lucene42NormsFormat extends NormsFormat { @Override public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException { - return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, acceptableOverheadRatio); + return new Lucene42NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, acceptableOverheadRatio); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html index 571b7668c41..ae55e7a63de 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html @@ -178,7 +178,7 @@ For each field in each document, a value is stored that is multiplied into the score for hits on that field.
  • -{@link org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat Term Vectors}. +{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vectors}. For each field in each document, the term vector (sometimes called document vector) may be stored. A term vector consists of term text and term frequency. To add Term Vectors to your index see the @@ -299,17 +299,17 @@ systems that frequently run out of file handles. Encodes additional scoring factors or other per-document information. -{@link org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat Term Vector Index} +{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Index} .tvx Stores offset into the document data file -{@link org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat Term Vector Documents} +{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Documents} .tvd Contains information about each document that has term vectors -{@link org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat Term Vector Fields} +{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Fields} .tvf The field level info about term vectors diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45Codec.java new file mode 100644 index 00000000000..5ec25e1003c --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45Codec.java @@ -0,0 +1,141 @@ +package org.apache.lucene.codecs.lucene45; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat; +import org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat; +import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat; +import org.apache.lucene.codecs.lucene42.Lucene42FieldInfosFormat; +import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; +import org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * Implements the Lucene 4.5 index format, with configurable per-field postings + * and docvalues formats. + *

    + * If you want to reuse functionality of this codec in another codec, extend + * {@link FilterCodec}. + * + * @see org.apache.lucene.codecs.lucene45 package documentation for file format details. + * @lucene.experimental + */ +// NOTE: if we make largish changes in a minor release, easier to just make Lucene46Codec or whatever +// if they are backwards compatible or smallish we can probably do the backwards in the postingsreader +// (it writes a minor version, etc). +public class Lucene45Codec extends Codec { + private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat(); + private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene42FieldInfosFormat(); + private final SegmentInfoFormat infosFormat = new Lucene40SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene40LiveDocsFormat(); + + private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene45Codec.this.getPostingsFormatForField(field); + } + }; + + + private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene45Codec.this.getDocValuesFormatForField(field); + } + }; + + /** Sole constructor. */ + public Lucene45Codec() { + super("Lucene45"); + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return fieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return infosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + /** Returns the postings format that should be used for writing + * new segments of field. + * + * The default implementation always returns "Lucene41" + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultFormat; + } + + /** Returns the docvalues format that should be used for writing + * new segments of field. 
+ * + * The default implementation always returns "Lucene45" + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41"); + private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene45"); + + private final NormsFormat normsFormat = new Lucene42NormsFormat(); + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java similarity index 80% rename from lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java index 2d4853a66b0..e5afdf70abb 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.diskdv; +package org.apache.lucene.codecs.lucene45; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,6 +17,7 @@ package org.apache.lucene.codecs.diskdv; * limitations under the License. */ +import java.io.Closeable; // javadocs import java.io.IOException; import java.util.HashMap; import java.util.HashSet; @@ -36,8 +37,8 @@ import org.apache.lucene.util.packed.BlockPackedWriter; import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; import org.apache.lucene.util.packed.PackedInts; -/** writer for {@link DiskDocValuesFormat} */ -public class DiskDocValuesConsumer extends DocValuesConsumer { +/** writer for {@link Lucene45DocValuesFormat} */ +public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Closeable { static final int BLOCK_SIZE = 16384; static final int ADDRESS_INTERVAL = 16; @@ -59,15 +60,16 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { final IndexOutput data, meta; final int maxDoc; - public DiskDocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + /** expert: Creates a new writer */ + public Lucene45DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { boolean success = false; try { String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); data = state.directory.createOutput(dataName, state.context); - CodecUtil.writeHeader(data, dataCodec, DiskDocValuesFormat.VERSION_CURRENT); + CodecUtil.writeHeader(data, dataCodec, Lucene45DocValuesFormat.VERSION_CURRENT); String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); meta = state.directory.createOutput(metaName, state.context); - CodecUtil.writeHeader(meta, metaCodec, DiskDocValuesFormat.VERSION_CURRENT); + CodecUtil.writeHeader(meta, metaCodec, Lucene45DocValuesFormat.VERSION_CURRENT); maxDoc = state.segmentInfo.getDocCount(); success = true; } finally { @@ -87,13 +89,20 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { long minValue = Long.MAX_VALUE; long maxValue = Long.MIN_VALUE; long gcd = 0; + boolean missing = false; // 
TODO: more efficient? HashSet uniqueValues = null; if (optimizeStorage) { uniqueValues = new HashSet<>(); for (Number nv : values) { - final long v = nv.longValue(); + final long v; + if (nv == null) { + v = 0; + missing = true; + } else { + v = nv.longValue(); + } if (gcd != 1) { if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { @@ -138,8 +147,14 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { format = DELTA_COMPRESSED; } meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.NUMERIC); + meta.writeByte(Lucene45DocValuesFormat.NUMERIC); meta.writeVInt(format); + if (missing) { + meta.writeLong(data.getFilePointer()); + writeMissingBitset(values); + } else { + meta.writeLong(-1L); + } meta.writeVInt(PackedInts.VERSION_CURRENT); meta.writeLong(data.getFilePointer()); meta.writeVLong(count); @@ -151,14 +166,15 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { meta.writeLong(gcd); final BlockPackedWriter quotientWriter = new BlockPackedWriter(data, BLOCK_SIZE); for (Number nv : values) { - quotientWriter.add((nv.longValue() - minValue) / gcd); + long value = nv == null ? 0 : nv.longValue(); + quotientWriter.add((value - minValue) / gcd); } quotientWriter.finish(); break; case DELTA_COMPRESSED: final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); for (Number nv : values) { - writer.add(nv.longValue()); + writer.add(nv == null ? 0 : nv.longValue()); } writer.finish(); break; @@ -173,7 +189,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { final int bitsRequired = PackedInts.bitsRequired(uniqueValues.size() - 1); final PackedInts.Writer ordsWriter = PackedInts.getWriterNoHeader(data, PackedInts.Format.PACKED, (int) count, bitsRequired, PackedInts.DEFAULT_BUFFER_SIZE); for (Number nv : values) { - ordsWriter.add(encode.get(nv.longValue())); + ordsWriter.add(encode.get(nv == null ? 0 : nv.longValue())); } ordsWriter.finish(); break; @@ -181,23 +197,60 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { throw new AssertionError(); } } + + // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on, + // but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode) + void writeMissingBitset(Iterable values) throws IOException { + byte bits = 0; + int count = 0; + for (Object v : values) { + if (count == 8) { + data.writeByte(bits); + count = 0; + bits = 0; + } + if (v != null) { + bits |= 1 << (count & 7); + } + count++; + } + if (count > 0) { + data.writeByte(bits); + } + } @Override public void addBinaryField(FieldInfo field, Iterable values) throws IOException { // write the byte[] data meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.BINARY); + meta.writeByte(Lucene45DocValuesFormat.BINARY); int minLength = Integer.MAX_VALUE; int maxLength = Integer.MIN_VALUE; final long startFP = data.getFilePointer(); long count = 0; + boolean missing = false; for(BytesRef v : values) { - minLength = Math.min(minLength, v.length); - maxLength = Math.max(maxLength, v.length); - data.writeBytes(v.bytes, v.offset, v.length); + final int length; + if (v == null) { + length = 0; + missing = true; + } else { + length = v.length; + } + minLength = Math.min(minLength, length); + maxLength = Math.max(maxLength, length); + if (v != null) { + data.writeBytes(v.bytes, v.offset, v.length); + } count++; } meta.writeVInt(minLength == maxLength ? 
BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED); + if (missing) { + meta.writeLong(data.getFilePointer()); + writeMissingBitset(values); + } else { + meta.writeLong(-1L); + } meta.writeVInt(minLength); meta.writeVInt(maxLength); meta.writeVLong(count); @@ -213,13 +266,16 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE); long addr = 0; for (BytesRef v : values) { - addr += v.length; + if (v != null) { + addr += v.length; + } writer.add(addr); } writer.finish(); } } + /** expert: writes a value dictionary for a sorted/sortedset field */ protected void addTermsDict(FieldInfo field, final Iterable values) throws IOException { // first check if its a "fixed-length" terms dict int minLength = Integer.MAX_VALUE; @@ -234,8 +290,9 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { } else { // header meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.BINARY); + meta.writeByte(Lucene45DocValuesFormat.BINARY); meta.writeVInt(BINARY_PREFIX_COMPRESSED); + meta.writeLong(-1L); // now write the bytes: sharing prefixes within a block final long startFP = data.getFilePointer(); // currently, we have to store the delta from expected for every 1/nth term @@ -279,7 +336,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { @Override public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.SORTED); + meta.writeByte(Lucene45DocValuesFormat.SORTED); addTermsDict(field, values); addNumericField(field, docToOrd, false); } @@ -287,7 +344,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { @Override public void addSortedSetField(FieldInfo field, Iterable values, Iterable docToOrdCount, Iterable ords) throws IOException { meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.SORTED_SET); + meta.writeByte(Lucene45DocValuesFormat.SORTED_SET); // write the ord -> byte[] as a binary field addTermsDict(field, values); // write the stream of ords as a numeric field @@ -296,8 +353,9 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { // write the doc -> ord count as a absolute index to the stream meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.NUMERIC); + meta.writeByte(Lucene45DocValuesFormat.NUMERIC); meta.writeVInt(DELTA_COMPRESSED); + meta.writeLong(-1L); meta.writeVInt(PackedInts.VERSION_CURRENT); meta.writeLong(data.getFilePointer()); meta.writeVLong(maxDoc); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java new file mode 100644 index 00000000000..3f3387ae244 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java @@ -0,0 +1,186 @@ +package org.apache.lucene.codecs.lucene45; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.FieldInfo.DocValuesType; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.SmallFloat; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.packed.BlockPackedWriter; +import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Lucene 4.5 DocValues format. + *

+ * <p>
+ * Encodes the four per-document value types (Numeric, Binary, Sorted, SortedSet) with these strategies:
+ * <p>
+ * {@link DocValuesType#NUMERIC NUMERIC}:
+ * <ul>
+ *   <li>Delta-compressed: per-document integers written in blocks of 16k. For each block
+ *       the minimum value in that block is encoded, and each entry is a delta from that
+ *       minimum value. Each block of deltas is compressed with bitpacking. For more
+ *       information, see {@link BlockPackedWriter}.
+ *   <li>Table-compressed: when the number of unique values is very small (< 256), and
+ *       when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
+ *       a lookup table is written instead. Each per-document entry is instead the ordinal
+ *       to this table, and those ordinals are compressed with bitpacking ({@link PackedInts}).
+ *   <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
+ *       common divisor (GCD) is computed, and quotients are stored using Delta-compressed
+ *       Numerics (see the sketch after these lists).
+ * </ul>
+ * <p>
+ * {@link DocValuesType#BINARY BINARY}:
+ * <ul>
+ *   <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
+ *       Each document's value can be addressed directly with multiplication ({@code docID * length}).
+ *   <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
+ *       for each document. The addresses are written in blocks of 16k, with the current absolute
+ *       start for the block, and the average (expected) delta per entry. For each document the
+ *       deviation from the delta (actual - expected) is written. (An addressing sketch follows
+ *       the file list below.)
+ *   <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
+ *       completely and other values sharing prefixes. Chunk addresses are written in blocks of 16k,
+ *       with the current absolute start for the block, and the average (expected) delta per entry.
+ *       For each chunk the deviation from the delta (actual - expected) is written.
+ * </ul>
+ * <p>
+ * {@link DocValuesType#SORTED SORTED}:
+ * <ul>
+ *   <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary,
+ *       along with the per-document ordinals written using one of the numeric strategies above.
+ * </ul>
+ * <p>
+ * {@link DocValuesType#SORTED_SET SORTED_SET}:
+ * <ul>
+ *   <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Prefix-Compressed Binary,
+ *       an ordinal list and per-document index into this list are written using the numeric strategies
+ *       above.
+ * </ul>
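To make the GCD strategy concrete, here is a minimal, hypothetical sketch (not the codec's code; the day-granularity timestamps are invented for illustration) of why only small quotients remain to be delta-encoded and bit-packed:

```java
import java.math.BigInteger;

// Day-granularity timestamps all share a divisor of 86,400,000 ms, so after
// subtracting the minimum and dividing by the GCD only tiny quotients remain.
public class GcdSketch {
  public static void main(String[] args) {
    long[] values = {1388534400000L, 1388620800000L, 1388793600000L};
    long min = Long.MAX_VALUE;
    for (long v : values) {
      min = Math.min(min, v);
    }
    long gcd = 0;
    for (long v : values) {
      // GCD of the offsets from the minimum value
      gcd = BigInteger.valueOf(gcd).gcd(BigInteger.valueOf(v - min)).longValue();
    }
    for (long v : values) {
      System.out.println((v - min) / gcd); // 0, 1, 3 instead of 13-digit longs
    }
  }
}
```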

      + *
    1. .dvd: DocValues data
    2. + *
    3. .dvm: DocValues metadata
    4. + *
    + *
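As referenced above, a minimal sketch of variable-width binary addressing, assuming a plain long[] of monotonic end addresses in place of the codec's MonotonicBlockPackedReader:

```java
import java.nio.charset.StandardCharsets;

// The i-th value's slice of the concatenated byte[] is bounded by the
// previous document's end address and its own end address.
public class AddressSketch {
  static byte[] slice(byte[] concatenated, long[] endAddresses, int docID) {
    int start = docID == 0 ? 0 : (int) endAddresses[docID - 1];
    int end = (int) endAddresses[docID];
    byte[] value = new byte[end - start];
    System.arraycopy(concatenated, start, value, 0, end - start);
    return value;
  }

  public static void main(String[] args) {
    byte[] data = "foobarbaz".getBytes(StandardCharsets.UTF_8);
    long[] ends = {3, 6, 9}; // "foo", "bar", "baz"
    System.out.println(new String(slice(data, ends, 1), StandardCharsets.UTF_8)); // bar
  }
}
```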
      + *
+ * <ol>
+ *   <li><a name="dvm"></a>
+ *   <p>The DocValues metadata or .dvm file.</p>
+ *   <p>For each DocValues field, this stores metadata, such as the offset into the
+ *      DocValues data (.dvd)</p>
+ *   <p>DocValues metadata (.dvm) --> Header, <Entry><sup>NumFields</sup></p>
+ *   <ul>
+ *     <li>Entry --> NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry</li>
+ *     <li>NumericEntry --> GCDNumericEntry | TableNumericEntry | DeltaNumericEntry</li>
+ *     <li>GCDNumericEntry --> NumericHeader, MinValue, GCD</li>
+ *     <li>TableNumericEntry --> NumericHeader, TableSize, {@link DataOutput#writeLong Int64}<sup>TableSize</sup></li>
+ *     <li>DeltaNumericEntry --> NumericHeader</li>
+ *     <li>NumericHeader --> FieldNumber, EntryType, NumericType, MissingOffset, PackedVersion, DataOffset, Count, BlockSize</li>
+ *     <li>BinaryEntry --> FixedBinaryEntry | VariableBinaryEntry | PrefixBinaryEntry</li>
+ *     <li>FixedBinaryEntry --> BinaryHeader</li>
+ *     <li>VariableBinaryEntry --> BinaryHeader, AddressOffset, PackedVersion, BlockSize</li>
+ *     <li>PrefixBinaryEntry --> BinaryHeader, AddressInterval, AddressOffset, PackedVersion, BlockSize</li>
+ *     <li>BinaryHeader --> FieldNumber, EntryType, BinaryType, MissingOffset, MinLength, MaxLength, DataOffset</li>
+ *     <li>SortedEntry --> FieldNumber, EntryType, BinaryEntry, NumericEntry</li>
+ *     <li>SortedSetEntry --> EntryType, BinaryEntry, NumericEntry, NumericEntry</li>
+ *     <li>FieldNumber, PackedVersion, MinLength, MaxLength, BlockSize, ValueCount --> {@link DataOutput#writeVInt VInt}</li>
+ *     <li>EntryType, CompressionType --> {@link DataOutput#writeByte Byte}</li>
+ *     <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ *     <li>MinValue, GCD, MissingOffset, AddressOffset, DataOffset --> {@link DataOutput#writeLong Int64}</li>
+ *     <li>TableSize --> {@link DataOutput#writeVInt vInt}</li>
+ *   </ul>
+ *   <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
+ *      and an ordinary NumericEntry for the document-to-ord metadata.</p>
+ *   <p>SortedSet fields have three entries: a BinaryEntry with the value metadata,
+ *      and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p>
+ *   <p>A FieldNumber of -1 indicates the end of metadata (a read-loop sketch follows
+ *      the file descriptions below).</p>
+ *   <p>EntryType is 0 (NumericEntry) or 1 (BinaryEntry)</p>
+ *   <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>

+ *   <p>NumericType indicates how Numeric values will be compressed:
+ *   <ul>
+ *     <li>0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
+ *         from the minimum value within the block.
+ *     <li>1 --> gcd-compressed. When all integers share a common divisor, only quotients are stored
+ *         using blocks of delta-encoded ints.
+ *     <li>2 --> table-compressed. When the number of unique numeric values is small and it would save space,
+ *         a lookup table of unique values is written, followed by the ordinal for each document.
+ *   </ul>
+ *   <p>BinaryType indicates how Binary values will be stored:
+ *   <ul>
+ *     <li>0 --> fixed-width. All values have the same length, addressing by multiplication.
+ *     <li>1 --> variable-width. An address for each value is stored.
+ *     <li>2 --> prefix-compressed. An address to the start of every interval'th value is stored.
+ *   </ul>
+ *   <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
+ *      If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
+ *      Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion, BlockSize)
+ *      is written for the addresses.
+ *   <p>MissingOffset points to a byte[] containing a bitset of all documents that had a value for the field.
+ *      If it is -1, then there are no missing values. (A bit-twiddling sketch of this bitset follows the
+ *      file descriptions below.)

+ *   <li><a name="dvd"></a>
+ *   <p>The DocValues data or .dvd file.</p>
+ *   <p>For each DocValues field, this stores the actual per-document data (the heavy lifting)</p>
+ *   <p>DocValues data (.dvd) --> Header, <NumericData | BinaryData | SortedData><sup>NumFields</sup></p>
+ *   <ul>
+ *     <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | GCDCompressedNumerics</li>
+ *     <li>BinaryData --> {@link DataOutput#writeByte Byte}<sup>DataLength</sup>, Addresses</li>
+ *     <li>SortedData --> {@link FST FST<Int64>}</li>
+ *     <li>DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
+ *     <li>TableCompressedNumerics --> {@link PackedInts PackedInts}</li>
+ *     <li>GCDCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
+ *     <li>Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li>
+ *   </ul>
+ *   <p>SortedSet entries store the list of ordinals in their BinaryData as a
+ *      sequence of increasing, delta-encoded {@link DataOutput#writeVLong vLong}s.</p>
+ * </ol>
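As referenced above, two standalone sketches. First, the metadata walk that the -1 sentinel implies, mirroring the producer's readFields loop (entry decoding elided):

```java
import java.io.IOException;
import org.apache.lucene.store.IndexInput;

// Field numbers are read until the -1 end-of-metadata marker; each entry's
// type byte (0=NUMERIC, 1=BINARY, 2=SORTED, 3=SORTED_SET) selects the
// sub-grammar to decode next.
class DvmWalkSketch {
  static void walk(IndexInput meta) throws IOException {
    for (int field = meta.readVInt(); field != -1; field = meta.readVInt()) {
      byte type = meta.readByte();
      // ... decode the NumericEntry/BinaryEntry/... that `type` announces ...
    }
  }
}
```

Second, the docs-with-value bitset behind MissingOffset, mirroring the consumer's writeMissingBitset and the producer's getMissingBits (plain arrays stand in for the index files):

```java
// One bit per document, least-significant bit first within each byte;
// a set bit means the document has a value for the field.
public class MissingBitsetSketch {
  static byte[] write(Object[] docValues) {
    byte[] bits = new byte[(docValues.length + 7) / 8];
    for (int doc = 0; doc < docValues.length; doc++) {
      if (docValues[doc] != null) {
        bits[doc >> 3] |= 1 << (doc & 7);
      }
    }
    return bits;
  }

  static boolean hasValue(byte[] bits, int doc) {
    return (bits[doc >> 3] & (1 << (doc & 7))) != 0;
  }

  public static void main(String[] args) {
    byte[] bits = write(new Object[] {"a", null, "c"});
    System.out.println(hasValue(bits, 0) + " " + hasValue(bits, 1) + " " + hasValue(bits, 2)); // true false true
  }
}
```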
    + * @lucene.experimental + */ +public final class Lucene45DocValuesFormat extends DocValuesFormat { + + /** Sole Constructor */ + public Lucene45DocValuesFormat() { + super("Lucene45"); + } + + @Override + public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new Lucene45DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + } + + @Override + public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { + return new Lucene45DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + } + + static final String DATA_CODEC = "Lucene45DocValuesData"; + static final String DATA_EXTENSION = "dvd"; + static final String META_CODEC = "Lucene45ValuesMetadata"; + static final String META_EXTENSION = "dvm"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + static final byte NUMERIC = 0; + static final byte BINARY = 1; + static final byte SORTED = 2; + static final byte SORTED_SET = 3; +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java new file mode 100644 index 00000000000..b12fa6d75d9 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java @@ -0,0 +1,831 @@ +package org.apache.lucene.codecs.lucene45; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.DELTA_COMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.GCD_COMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.TABLE_COMPRESSED; + +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_FIXED_UNCOMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_PREFIX_COMPRESSED; + +import java.io.Closeable; // javadocs +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.packed.BlockPackedReader; +import org.apache.lucene.util.packed.MonotonicBlockPackedReader; +import org.apache.lucene.util.packed.PackedInts; + +/** reader for {@link Lucene45DocValuesFormat} */ +public class Lucene45DocValuesProducer extends DocValuesProducer implements Closeable { + private final Map numerics; + private final Map binaries; + private final Map ords; + private final Map ordIndexes; + private final IndexInput data; + private final int maxDoc; + + // memory-resident structures + private final Map addressInstances = new HashMap(); + private final Map ordIndexInstances = new HashMap(); + + /** expert: instantiates a new reader */ + protected Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + // read in the entries from the metadata file. 
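+ // (the .dvm metadata file is small and consumed eagerly in this constructor;
+ // the much larger .dvd data file is opened once further down and cloned per
+ // accessor, so per-document data stays on disk until it is actually read)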
+ IndexInput in = state.directory.openInput(metaName, state.context); + this.maxDoc = state.segmentInfo.getDocCount(); + boolean success = false; + final int version; + try { + version = CodecUtil.checkHeader(in, metaCodec, + Lucene45DocValuesFormat.VERSION_CURRENT, + Lucene45DocValuesFormat.VERSION_CURRENT); + numerics = new HashMap(); + ords = new HashMap(); + ordIndexes = new HashMap(); + binaries = new HashMap(); + readFields(in, state.fieldInfos); + + success = true; + } finally { + if (success) { + IOUtils.close(in); + } else { + IOUtils.closeWhileHandlingException(in); + } + } + + success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.openInput(dataName, state.context); + final int version2 = CodecUtil.checkHeader(data, dataCodec, + Lucene45DocValuesFormat.VERSION_CURRENT, + Lucene45DocValuesFormat.VERSION_CURRENT); + if (version != version2) { + throw new CorruptIndexException("Format versions mismatch"); + } + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this.data); + } + } + } + + private void readFields(IndexInput meta, FieldInfos infos) throws IOException { + int fieldNumber = meta.readVInt(); + while (fieldNumber != -1) { + byte type = meta.readByte(); + if (type == Lucene45DocValuesFormat.NUMERIC) { + numerics.put(fieldNumber, readNumericEntry(meta)); + } else if (type == Lucene45DocValuesFormat.BINARY) { + BinaryEntry b = readBinaryEntry(meta); + binaries.put(fieldNumber, b); + } else if (type == Lucene45DocValuesFormat.SORTED) { + // sorted = binary + numeric + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + if (meta.readByte() != Lucene45DocValuesFormat.BINARY) { + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + BinaryEntry b = readBinaryEntry(meta); + binaries.put(fieldNumber, b); + + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + NumericEntry n = readNumericEntry(meta); + ords.put(fieldNumber, n); + } else if (type == Lucene45DocValuesFormat.SORTED_SET) { + // sortedset = binary + numeric + ordIndex + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + if (meta.readByte() != Lucene45DocValuesFormat.BINARY) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + BinaryEntry b = readBinaryEntry(meta); + binaries.put(fieldNumber, b); + + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + NumericEntry n1 = readNumericEntry(meta); + ords.put(fieldNumber, n1); + + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is 
corrupt (resource=" + meta + ")"); + } + if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + NumericEntry n2 = readNumericEntry(meta); + ordIndexes.put(fieldNumber, n2); + } else { + throw new CorruptIndexException("invalid type: " + type + ", resource=" + meta); + } + fieldNumber = meta.readVInt(); + } + } + + static NumericEntry readNumericEntry(IndexInput meta) throws IOException { + NumericEntry entry = new NumericEntry(); + entry.format = meta.readVInt(); + entry.missingOffset = meta.readLong(); + entry.packedIntsVersion = meta.readVInt(); + entry.offset = meta.readLong(); + entry.count = meta.readVLong(); + entry.blockSize = meta.readVInt(); + switch(entry.format) { + case GCD_COMPRESSED: + entry.minValue = meta.readLong(); + entry.gcd = meta.readLong(); + break; + case TABLE_COMPRESSED: + if (entry.count > Integer.MAX_VALUE) { + throw new CorruptIndexException("Cannot use TABLE_COMPRESSED with more than MAX_VALUE values, input=" + meta); + } + final int uniqueValues = meta.readVInt(); + if (uniqueValues > 256) { + throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + meta); + } + entry.table = new long[uniqueValues]; + for (int i = 0; i < uniqueValues; ++i) { + entry.table[i] = meta.readLong(); + } + break; + case DELTA_COMPRESSED: + break; + default: + throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); + } + return entry; + } + + static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException { + BinaryEntry entry = new BinaryEntry(); + entry.format = meta.readVInt(); + entry.missingOffset = meta.readLong(); + entry.minLength = meta.readVInt(); + entry.maxLength = meta.readVInt(); + entry.count = meta.readVLong(); + entry.offset = meta.readLong(); + switch(entry.format) { + case BINARY_FIXED_UNCOMPRESSED: + break; + case BINARY_PREFIX_COMPRESSED: + entry.addressInterval = meta.readVInt(); + entry.addressesOffset = meta.readLong(); + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); + break; + case BINARY_VARIABLE_UNCOMPRESSED: + entry.addressesOffset = meta.readLong(); + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); + break; + default: + throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); + } + return entry; + } + + @Override + public NumericDocValues getNumeric(FieldInfo field) throws IOException { + NumericEntry entry = numerics.get(field.number); + return getNumeric(entry); + } + + LongNumericDocValues getNumeric(NumericEntry entry) throws IOException { + final IndexInput data = this.data.clone(); + data.seek(entry.offset); + + switch (entry.format) { + case DELTA_COMPRESSED: + final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); + return new LongNumericDocValues() { + @Override + public long get(long id) { + return reader.get(id); + } + }; + case GCD_COMPRESSED: + final long min = entry.minValue; + final long mult = entry.gcd; + final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); + return new LongNumericDocValues() { + @Override + public long get(long id) { + return min + mult * quotientReader.get(id); + } + }; + case TABLE_COMPRESSED: + final long table[] = entry.table; + final int bitsRequired = 
PackedInts.bitsRequired(table.length - 1); + final PackedInts.Reader ords = PackedInts.getDirectReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, (int) entry.count, bitsRequired); + return new LongNumericDocValues() { + @Override + public long get(long id) { + return table[(int) ords.get((int) id)]; + } + }; + default: + throw new AssertionError(); + } + } + + @Override + public BinaryDocValues getBinary(FieldInfo field) throws IOException { + BinaryEntry bytes = binaries.get(field.number); + switch(bytes.format) { + case BINARY_FIXED_UNCOMPRESSED: + return getFixedBinary(field, bytes); + case BINARY_VARIABLE_UNCOMPRESSED: + return getVariableBinary(field, bytes); + case BINARY_PREFIX_COMPRESSED: + return getCompressedBinary(field, bytes); + default: + throw new AssertionError(); + } + } + + private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) { + final IndexInput data = this.data.clone(); + + return new LongBinaryDocValues() { + @Override + public void get(long id, BytesRef result) { + long address = bytes.offset + id * bytes.maxLength; + try { + data.seek(address); + // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) + // assume "they" own the bytes after calling this! + final byte[] buffer = new byte[bytes.maxLength]; + data.readBytes(buffer, 0, buffer.length); + result.bytes = buffer; + result.offset = 0; + result.length = buffer.length; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + /** returns an address instance for variable-length binary values. + * @lucene.internal */ + protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException { + final MonotonicBlockPackedReader addresses; + synchronized (addressInstances) { + MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number); + if (addrInstance == null) { + data.seek(bytes.addressesOffset); + addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false); + addressInstances.put(field.number, addrInstance); + } + addresses = addrInstance; + } + return addresses; + } + + private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final IndexInput data = this.data.clone(); + + final MonotonicBlockPackedReader addresses = getAddressInstance(data, field, bytes); + + return new LongBinaryDocValues() { + @Override + public void get(long id, BytesRef result) { + long startAddress = bytes.offset + (id == 0 ? 0 : addresses.get(id-1)); + long endAddress = bytes.offset + addresses.get(id); + int length = (int) (endAddress - startAddress); + try { + data.seek(startAddress); + // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) + // assume "they" own the bytes after calling this! + final byte[] buffer = new byte[length]; + data.readBytes(buffer, 0, buffer.length); + result.bytes = buffer; + result.offset = 0; + result.length = length; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + /** returns an address instance for prefix-compressed binary values. 
+ * @lucene.internal */ + protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException { + final MonotonicBlockPackedReader addresses; + final long interval = bytes.addressInterval; + synchronized (addressInstances) { + MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number); + if (addrInstance == null) { + data.seek(bytes.addressesOffset); + final long size; + if (bytes.count % interval == 0) { + size = bytes.count / interval; + } else { + size = 1L + bytes.count / interval; + } + addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false); + addressInstances.put(field.number, addrInstance); + } + addresses = addrInstance; + } + return addresses; + } + + + private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final IndexInput data = this.data.clone(); + + final MonotonicBlockPackedReader addresses = getIntervalInstance(data, field, bytes); + + return new CompressedBinaryDocValues(bytes, addresses, data); + } + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + final int valueCount = (int) binaries.get(field.number).count; + final BinaryDocValues binary = getBinary(field); + NumericEntry entry = ords.get(field.number); + IndexInput data = this.data.clone(); + data.seek(entry.offset); + final BlockPackedReader ordinals = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); + + return new SortedDocValues() { + + @Override + public int getOrd(int docID) { + return (int) ordinals.get(docID); + } + + @Override + public void lookupOrd(int ord, BytesRef result) { + binary.get(ord, result); + } + + @Override + public int getValueCount() { + return valueCount; + } + + @Override + public int lookupTerm(BytesRef key) { + if (binary instanceof CompressedBinaryDocValues) { + return (int) ((CompressedBinaryDocValues)binary).lookupTerm(key); + } else { + return super.lookupTerm(key); + } + } + + @Override + public TermsEnum termsEnum() { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues)binary).getTermsEnum(); + } else { + return super.termsEnum(); + } + } + }; + } + + /** returns an address instance for sortedset ordinal lists + * @lucene.internal */ + protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException { + final MonotonicBlockPackedReader ordIndex; + synchronized (ordIndexInstances) { + MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number); + if (ordIndexInstance == null) { + data.seek(entry.offset); + ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false); + ordIndexInstances.put(field.number, ordIndexInstance); + } + ordIndex = ordIndexInstance; + } + return ordIndex; + } + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + final IndexInput data = this.data.clone(); + final long valueCount = binaries.get(field.number).count; + // we keep the byte[]s and list of ords on disk, these could be large + final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); + final LongNumericDocValues ordinals = getNumeric(ords.get(field.number)); + // but the addresses to the ord stream are in RAM + final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(data, field, 
ordIndexes.get(field.number)); + + return new SortedSetDocValues() { + long offset; + long endOffset; + + @Override + public long nextOrd() { + if (offset == endOffset) { + return NO_MORE_ORDS; + } else { + long ord = ordinals.get(offset); + offset++; + return ord; + } + } + + @Override + public void setDocument(int docID) { + offset = (docID == 0 ? 0 : ordIndex.get(docID-1)); + endOffset = ordIndex.get(docID); + } + + @Override + public void lookupOrd(long ord, BytesRef result) { + binary.get(ord, result); + } + + @Override + public long getValueCount() { + return valueCount; + } + + @Override + public long lookupTerm(BytesRef key) { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues)binary).lookupTerm(key); + } else { + return super.lookupTerm(key); + } + } + + @Override + public TermsEnum termsEnum() { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues)binary).getTermsEnum(); + } else { + return super.termsEnum(); + } + } + }; + } + + private Bits getMissingBits(final long offset) throws IOException { + if (offset == -1) { + return new Bits.MatchAllBits(maxDoc); + } else { + final IndexInput in = data.clone(); + return new Bits() { + + @Override + public boolean get(int index) { + try { + in.seek(offset + (index >> 3)); + return (in.readByte() & (1 << (index & 7))) != 0; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public int length() { + return maxDoc; + } + }; + } + } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + switch(field.getDocValuesType()) { + case SORTED_SET: + return new SortedSetDocsWithField(getSortedSet(field), maxDoc); + case SORTED: + return new SortedDocsWithField(getSorted(field), maxDoc); + case BINARY: + BinaryEntry be = binaries.get(field.number); + return getMissingBits(be.missingOffset); + case NUMERIC: + NumericEntry ne = numerics.get(field.number); + return getMissingBits(ne.missingOffset); + default: + throw new AssertionError(); + } + } + + @Override + public void close() throws IOException { + data.close(); + } + + /** metadata entry for a numeric docvalues field */ + protected static class NumericEntry { + private NumericEntry() {} + /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */ + long missingOffset; + /** offset to the actual numeric values */ + public long offset; + + int format; + /** packed ints version used to encode these numerics */ + public int packedIntsVersion; + /** count of values written */ + public long count; + /** packed ints blocksize */ + public int blockSize; + + long minValue; + long gcd; + long table[]; + } + + /** metadata entry for a binary docvalues field */ + protected static class BinaryEntry { + private BinaryEntry() {} + /** offset to the bitset representing docsWithField, or -1 if no documents have missing values */ + long missingOffset; + /** offset to the actual binary values */ + long offset; + + int format; + /** count of values written */ + public long count; + int minLength; + int maxLength; + /** offset to the addressing data that maps a value to its slice of the byte[] */ + public long addressesOffset; + /** interval of shared prefix chunks (when using prefix-compressed binary) */ + public long addressInterval; + /** packed ints version used to encode addressing information */ + public int packedIntsVersion; + /** packed ints blocksize */ + public int blockSize; + } + + // internally we compose complex dv 
(sorted/sortedset) from other ones + static abstract class LongNumericDocValues extends NumericDocValues { + @Override + public final long get(int docID) { + return get((long) docID); + } + + abstract long get(long id); + } + + static abstract class LongBinaryDocValues extends BinaryDocValues { + @Override + public final void get(int docID, BytesRef result) { + get((long)docID, result); + } + + abstract void get(long id, BytesRef Result); + } + + // in the compressed case, we add a few additional operations for + // more efficient reverse lookup and enumeration + static class CompressedBinaryDocValues extends LongBinaryDocValues { + final BinaryEntry bytes; + final long interval; + final long numValues; + final long numIndexValues; + final MonotonicBlockPackedReader addresses; + final IndexInput data; + final TermsEnum termsEnum; + + public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, IndexInput data) throws IOException { + this.bytes = bytes; + this.interval = bytes.addressInterval; + this.addresses = addresses; + this.data = data; + this.numValues = bytes.count; + this.numIndexValues = addresses.size(); + this.termsEnum = getTermsEnum(data); + } + + @Override + public void get(long id, BytesRef result) { + try { + termsEnum.seekExact(id); + BytesRef term = termsEnum.term(); + result.bytes = term.bytes; + result.offset = term.offset; + result.length = term.length; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + long lookupTerm(BytesRef key) { + try { + SeekStatus status = termsEnum.seekCeil(key); + if (status == SeekStatus.END) { + return -numValues-1; + } else if (status == SeekStatus.FOUND) { + return termsEnum.ord(); + } else { + return -termsEnum.ord()-1; + } + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + TermsEnum getTermsEnum() { + try { + return getTermsEnum(data.clone()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private TermsEnum getTermsEnum(final IndexInput input) throws IOException { + input.seek(bytes.offset); + + return new TermsEnum() { + private long currentOrd = -1; + // TODO: maxLength is negative when all terms are merged away... + private final BytesRef termBuffer = new BytesRef(bytes.maxLength < 0 ? 0 : bytes.maxLength); + private final BytesRef term = new BytesRef(); // TODO: paranoia? + + @Override + public BytesRef next() throws IOException { + if (doNext() == null) { + return null; + } else { + setTerm(); + return term; + } + } + + private BytesRef doNext() throws IOException { + if (++currentOrd >= numValues) { + return null; + } else { + int start = input.readVInt(); + int suffix = input.readVInt(); + input.readBytes(termBuffer.bytes, start, suffix); + termBuffer.length = start + suffix; + return termBuffer; + } + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + // binary-search just the index values to find the block, + // then scan within the block + long low = 0; + long high = numIndexValues-1; + + while (low <= high) { + long mid = (low + high) >>> 1; + doSeek(mid * interval); + int cmp = termBuffer.compareTo(text); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + // we got lucky, found an indexed term + setTerm(); + return SeekStatus.FOUND; + } + } + + if (numIndexValues == 0) { + return SeekStatus.END; + } + + // block before insertion point + long block = low-1; + doSeek(block < 0 ? 
-1 : block * interval); + + while (doNext() != null) { + int cmp = termBuffer.compareTo(text); + if (cmp == 0) { + setTerm(); + return SeekStatus.FOUND; + } else if (cmp > 0) { + setTerm(); + return SeekStatus.NOT_FOUND; + } + } + + return SeekStatus.END; + } + + @Override + public void seekExact(long ord) throws IOException { + doSeek(ord); + setTerm(); + } + + private void doSeek(long ord) throws IOException { + long block = ord / interval; + + if (ord >= currentOrd && block == currentOrd / interval) { + // seek within current block + } else { + // position before start of block + currentOrd = ord - ord % interval - 1; + input.seek(bytes.offset + addresses.get(block)); + } + + while (currentOrd < ord) { + doNext(); + } + } + + private void setTerm() { + // TODO: is there a cleaner way + term.bytes = new byte[termBuffer.length]; + term.offset = 0; + term.copyBytes(termBuffer); + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public long ord() throws IOException { + return currentOrd; + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() throws IOException { + return -1; + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + }; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/package.html b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/package.html new file mode 100644 index 00000000000..677c176a89b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/package.html @@ -0,0 +1,396 @@ + + + + + + + +Lucene 4.5 file format. + +

+<h1>Apache Lucene - Index File Formats</h1>

+<h2>Introduction</h2>
+<p>This document defines the index file formats used in this version of Lucene.
+If you are using a different version of Lucene, please consult the copy of
+docs/ that was distributed with the version you are using.</p>
+<p>Apache Lucene is written in Java, but several efforts are underway to write
+versions of Lucene in other programming languages. If these versions are to remain
+compatible with Apache Lucene, then a language-independent definition of the
+Lucene index format is required. This document thus attempts to provide a
+complete and independent definition of the Apache Lucene file formats.</p>
+<p>As Lucene evolves, this document should evolve. Versions of Lucene in
+different programming languages should endeavor to agree on file formats, and
+generate new versions of this document.</p>

+<h2>Definitions</h2>
+<p>The fundamental concepts in Lucene are index, document, field and term.</p>
+<p>An index contains a sequence of documents.</p>
+<ul>
+<li>A document is a sequence of fields.</li>
+<li>A field is a named sequence of terms.</li>
+<li>A term is a sequence of bytes.</li>
+</ul>
+<p>The same sequence of bytes in two different fields is considered a different
+term. Thus terms are represented as a pair: the string naming the field, and the
+bytes within the field.</p>

+<h3>Inverted Indexing</h3>
+<p>The index stores statistics about terms in order to make term-based search
+more efficient. Lucene's index falls into the family of indexes known as an
+inverted index. This is because it can list, for a term, the documents
+that contain it. This is the inverse of the natural relationship, in which
+documents list terms.</p>

+<h3>Types of Fields</h3>
+<p>In Lucene, fields may be stored, in which case their text is stored
+in the index literally, in a non-inverted manner. Fields that are inverted are
+called indexed. A field may be both stored and indexed.</p>
+<p>The text of a field may be tokenized into terms to be indexed, or the
+text of a field may be used literally as a term to be indexed. Most fields are
+tokenized, but sometimes it is useful for certain identifier fields to be
+indexed literally.</p>
+<p>See the {@link org.apache.lucene.document.Field Field}
+java docs for more information on Fields.</p>

+<h3>Segments</h3>
+<p>Lucene indexes may be composed of multiple sub-indexes, or segments.
+Each segment is a fully independent index, which could be searched separately.
+Indexes evolve by:</p>
+<ol>
+<li>Creating new segments for newly added documents.</li>
+<li>Merging existing segments.</li>
+</ol>
+<p>Searches may involve multiple segments and/or multiple indexes, each index
+potentially composed of a set of segments.</p>

+<h3>Document Numbers</h3>
+<p>Internally, Lucene refers to documents by an integer document number.
+The first document added to an index is numbered zero, and each subsequent
+document added gets a number one greater than the previous.</p>
+<p>Note that a document's number may change, so caution should be taken when
+storing these numbers outside of Lucene. In particular, numbers may change in
+the following situations:</p>
+<ul>
+<li>
+<p>The numbers stored in each segment are unique only within the segment, and
+must be converted before they can be used in a larger context. The standard
+technique is to allocate each segment a range of values, based on the range of
+numbers used in that segment. To convert a document number from a segment to an
+external value, the segment's base document number is added. To convert
+an external value back to a segment-specific value, the segment is identified
+by the range that the external value is in, and the segment's base value is
+subtracted. For example, two five-document segments might be combined, so that
+the first segment has a base value of zero, and the second of five. Document
+three from the second segment would have an external value of eight. (A short
+sketch of this conversion follows this list.)</p>
+</li>
+<li>
+<p>When documents are deleted, gaps are created in the numbering. These are
+eventually removed as the index evolves through merging. Deleted documents are
+dropped when segments are merged. A freshly-merged segment thus has no gaps in
+its numbering.</p>
+</li>
+</ul>
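A minimal sketch of that base-offset arithmetic (docBases is a hypothetical per-segment table; Lucene keeps the real equivalent in its reader bookkeeping):

```java
// Each segment's base is the total doc count of all segments before it;
// adding the base converts a segment-local number to an external one.
public class DocBaseSketch {
  public static void main(String[] args) {
    int[] segmentDocCounts = {5, 5}; // two five-document segments
    int[] docBases = new int[segmentDocCounts.length];
    for (int i = 1; i < docBases.length; i++) {
      docBases[i] = docBases[i - 1] + segmentDocCounts[i - 1];
    }
    int external = docBases[1] + 3; // document three of the second segment
    System.out.println(external);   // 8, as in the example above
  }
}
```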

    Index Structure Overview

    +
    +

    Each segment index maintains the following:

    • {@link org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat Segment info}.
This contains metadata about a segment, such as the number of documents and
what files it uses.

    • {@link org.apache.lucene.codecs.lucene42.Lucene42FieldInfosFormat Field names}.
This contains the set of field names used in the index.

    • {@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Stored Field values}.
This contains, for each document, a list of attribute-value pairs, where the attributes
are field names. These are used to store auxiliary information about the document, such as
its title, url, or an identifier to access a database. The set of stored fields is what is
returned for each hit when searching. This is keyed by document number.

    • {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term dictionary}.
A dictionary containing all of the terms used in all of the
indexed fields of all of the documents. The dictionary also contains the number
of documents which contain the term, and pointers to the term's frequency and
proximity data.

    • {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Frequency data}.
For each term in the dictionary, the numbers of all the
documents that contain that term, and the frequency of the term in that
document, unless frequencies are omitted (IndexOptions.DOCS_ONLY).

    • {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Proximity data}.
For each term in the dictionary, the positions that the
term occurs in each document. Note that this will not exist if all fields in
all documents omit position data.

    • {@link org.apache.lucene.codecs.lucene42.Lucene42NormsFormat Normalization factors}.
For each field in each document, a value is stored
that is multiplied into the score for hits on that field.

    • {@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vectors}.
For each field in each document, the term vector (sometimes
called document vector) may be stored. A term vector consists of term text and
term frequency. To add Term Vectors to your index, see the
{@link org.apache.lucene.document.Field Field} constructors.

    • {@link org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat Per-document values}.
Like stored values, these are also keyed by document
number, but are generally intended to be loaded into main memory for fast
access. Whereas stored values are generally intended for summary results from
searches, per-document values are useful for things like scoring factors.

    • {@link org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat Deleted documents}.
An optional file indicating which documents are deleted.
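    The per-segment structures listed above can all be reached from an
AtomicReader. The sketch below touches several of them, including the
getDocsWithField accessor added in this change; the field name "price" and the
index path are assumptions:

        import java.io.File;
        import org.apache.lucene.document.Document;
        import org.apache.lucene.index.AtomicReader;
        import org.apache.lucene.index.AtomicReaderContext;
        import org.apache.lucene.index.DirectoryReader;
        import org.apache.lucene.index.NumericDocValues;
        import org.apache.lucene.store.FSDirectory;
        import org.apache.lucene.util.Bits;

        public class SegmentStructuresExample {
          public static void main(String[] args) throws Exception {
            DirectoryReader dr = DirectoryReader.open(FSDirectory.open(new File("index")));
            for (AtomicReaderContext ctx : dr.leaves()) {
              AtomicReader reader = ctx.reader();
              Document stored = reader.document(0);                      // stored field values
              NumericDocValues dv = reader.getNumericDocValues("price"); // per-document values
              Bits live = reader.getLiveDocs();           // deleted documents (null: none deleted)
              Bits hasValue = reader.getDocsWithField("price"); // added in this change
            }
            dr.close();
          }
        }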

    Details on each of these are provided in their linked pages.


    File Naming


    All files belonging to a segment have the same name with varying extensions.
The extensions correspond to the different file formats described below. When
using the Compound File format (default in 1.4 and greater) these files (except
for the Segment info file, the Lock file, and Deleted documents file) are collapsed
into a single .cfs file (see below for details).

    Typically, all segments in an index are stored in a single directory,
although this is not required.

    As of version 2.1 (lock-less commits), file names are never re-used (there
is one exception, "segments.gen", see below). That is, when any file is saved
to the Directory it is given a filename that has never been used before. This
is achieved using a simple generations approach. For example, the first
segments file is segments_1, then segments_2, etc. The generation is a
sequential long integer represented in alpha-numeric (base 36) form.
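    The base-36 generation encoding is just Long.toString with radix 36; a
quick illustration (the generation values here are arbitrary):

        // segments_1, segments_2, ..., segments_z, segments_10, ...
        long gen = 1;
        String first = "segments_" + Long.toString(gen, Character.MAX_RADIX); // "segments_1"
        String later = "segments_" + Long.toString(36L, Character.MAX_RADIX); // "segments_10"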


    Summary of File Extensions


    The following table summarizes the names and extensions of the files in
Lucene:

    Name | Extension | Brief Description
    {@link org.apache.lucene.index.SegmentInfos Segments File} | segments.gen, segments_N | Stores information about a commit point
    Lock File | write.lock | The write lock prevents multiple IndexWriters from writing to the same index.
    {@link org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat Segment Info} | .si | Stores metadata about a segment
    {@link org.apache.lucene.store.CompoundFileDirectory Compound File} | .cfs, .cfe | An optional "virtual" file consisting of all the other index files for systems that frequently run out of file handles.
    {@link org.apache.lucene.codecs.lucene42.Lucene42FieldInfosFormat Fields} | .fnm | Stores information about the fields
    {@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Field Index} | .fdx | Contains pointers to field data
    {@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Field Data} | .fdt | The stored fields for documents
    {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Dictionary} | .tim | The term dictionary, stores term info
    {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Index} | .tip | The index into the Term Dictionary
    {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Frequencies} | .doc | Contains the list of docs which contain each term along with frequency
    {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Positions} | .pos | Stores position information about where a term occurs in the index
    {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Payloads} | .pay | Stores additional per-position metadata information such as character offsets and user payloads
    {@link org.apache.lucene.codecs.lucene42.Lucene42NormsFormat Norms} | .nvd, .nvm | Encodes length and boost factors for docs and fields
    {@link org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat Per-Document Values} | .dvd, .dvm | Encodes additional scoring factors or other per-document information.
    {@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Index} | .tvx | Stores offset into the document data file
    {@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Documents} | .tvd | Contains information about each document that has term vectors
    {@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Fields} | .tvf | The field level info about term vectors
    {@link org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat Deleted Documents} | .del | Info about what documents are deleted

    Lock File

    The write lock, which is stored in the index directory by default, is named
"write.lock". If the lock directory is different from the index directory then
the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
derived from the full path to the index directory. When this file is present, a
writer is currently modifying the index (adding or removing documents). This
lock file ensures that only one writer is modifying the index at a time.
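    One illustrative way to derive such a prefix is shown below. This is not
necessarily Lucene's exact derivation; the hash choice is an assumption and
only conveys the idea of a stable, path-unique prefix:

        import java.io.File;

        public class LockNameExample {
          public static void main(String[] args) throws Exception {
            String indexPath = new File("index").getCanonicalPath();
            // Hypothetical derivation: any stable hash of the full path would do.
            String prefix = Integer.toHexString(indexPath.hashCode());
            String lockName = prefix + "-write.lock";
            System.out.println(lockName);
          }
        }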


    History


    Compatibility notes are provided in this document, describing how file
formats have changed from prior versions:

    • In version 2.1, the file format was changed to allow lock-less commits (i.e.,
no more commit lock). The change is fully backwards compatible: you can open a
pre-2.1 index for searching or adding/deleting of docs. When the new segments
file is saved (committed), it will be written in the new file format (meaning
no specific "upgrade" process is needed). But note that once a commit has
occurred, pre-2.1 Lucene will not be able to read the index.

    • In version 2.3, the file format was changed to allow segments to share a
single set of doc store (vectors & stored fields) files. This allows for
faster indexing in certain cases. The change is fully backwards compatible (in
the same way as the lock-less commits change in 2.1).

    • In version 2.4, Strings are now written as a true UTF-8 byte sequence, not
Java's modified UTF-8. See LUCENE-510 for details.

    • In version 2.9, an optional opaque Map<String,String> CommitUserData
may be passed to IndexWriter's commit methods (and later retrieved), which is
recorded in the segments_N file. See LUCENE-1382 for details. Also,
diagnostics were added to each segment written, recording details about why it
was written (due to flush, merge; which OS/JRE was used; etc.). See
LUCENE-1654 for details.

    • In version 3.0, compressed fields are no longer written to the index (they
can still be read, but on merge the new segment will write them uncompressed).
See LUCENE-1960 for details.

    • In version 3.1, segments record the code version that created them. See
LUCENE-2720 for details. Additionally, segments track explicitly whether or
not they have term vectors. See LUCENE-2811 for details.

    • In version 3.2, numeric fields are written natively to the stored fields
file; previously they were stored in text format only.

    • In version 3.4, fields can omit position data while still indexing term
frequencies.

    • In version 4.0, the format of the inverted index became extensible via
the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
({@code DocValues}) was introduced. Normalization factors need no longer be a
single byte; they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
Terms need not be unicode strings; they can be any byte sequence. Term offsets
can optionally be indexed into the postings lists. Payloads can be stored in the
term vectors.

    • In version 4.1, the format of the postings list changed to use either
FOR compression or variable-byte encoding, depending upon the frequency
of the term. Terms appearing only once are now inlined directly into the term
dictionary. Stored fields are compressed by default.

    • In version 4.2, term vectors are compressed by default. DocValues has
a new multi-valued type (SortedSet) that can be used for faceting/grouping/joining
on multi-valued fields.

    • In version 4.5, DocValues were extended to explicitly represent missing values.

    Limitations


    Lucene uses a Java int to refer to document numbers, and the index file
format uses an Int32 on-disk to store document numbers. This is a limitation
of both the index file format and the current implementation. Eventually these
should be replaced with either UInt64 values, or better yet,
{@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.
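    For reference, the VInt encoding mentioned above stores seven bits per byte,
with the high bit signaling that more bytes follow, so it has no fixed upper
bound. A sketch consistent with the documented format (written for illustration,
not copied from Lucene's source):

        import java.io.IOException;
        import org.apache.lucene.store.DataOutput;

        class VIntSketch {
          // Seven bits per byte; the high bit is set while more bytes follow.
          static void writeVInt(DataOutput out, int i) throws IOException {
            while ((i & ~0x7F) != 0) {
              out.writeByte((byte) ((i & 0x7F) | 0x80));
              i >>>= 7;
            }
            out.writeByte((byte) i);
          }
        }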

    diff --git a/lucene/core/src/java/org/apache/lucene/codecs/package.html b/lucene/core/src/java/org/apache/lucene/codecs/package.html index f0f12b42b75..16f53b5cdde 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/package.html +++ b/lucene/core/src/java/org/apache/lucene/codecs/package.html @@ -61,9 +61,13 @@ name of your codec. If you just want to customise the {@link org.apache.lucene.codecs.PostingsFormat}, or use different postings formats for different fields, then you can register your custom postings format in the same way (in META-INF/services/org.apache.lucene.codecs.PostingsFormat), and then extend the default - {@link org.apache.lucene.codecs.lucene42.Lucene42Codec} and override - {@link org.apache.lucene.codecs.lucene42.Lucene42Codec#getPostingsFormatForField(String)} to return your custom + {@link org.apache.lucene.codecs.lucene45.Lucene45Codec} and override + {@link org.apache.lucene.codecs.lucene45.Lucene45Codec#getPostingsFormatForField(String)} to return your custom postings format.

    +

    + Similarly, if you just want to customise the {@link org.apache.lucene.codecs.DocValuesFormat} per-field, have + a look at {@link org.apache.lucene.codecs.lucene45.Lucene45Codec#getDocValuesFormatForField(String)}. +

    diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java index 72053a8b3a1..3ed6797e783 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java @@ -36,6 +36,7 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; @@ -265,6 +266,12 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { DocValuesProducer producer = fields.get(field.name); return producer == null ? null : producer.getSortedSet(field); } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + DocValuesProducer producer = fields.get(field.name); + return producer == null ? null : producer.getDocsWithField(field); + } @Override public void close() throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java b/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java index 1b0e4168969..a3e28e85c11 100644 --- a/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java @@ -207,6 +207,12 @@ public abstract class AtomicReader extends IndexReader { * this field. The returned instance should only be * used by a single thread. */ public abstract SortedSetDocValues getSortedSetDocValues(String field) throws IOException; + + /** Returns a {@link Bits} at the size of reader.maxDoc(), + * with turned on bits for each docid that does have a value for this field, + * or null if no DocValues were indexed for this field. 
The + * returned instance should only be used by a single thread */ + public abstract Bits getDocsWithField(String field) throws IOException; /** Returns {@link NumericDocValues} representing norms * for this field, or null if no {@link NumericDocValues} diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java index 5eab02d417c..f9f82317b62 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java @@ -26,6 +26,8 @@ import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Counter; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.PagedBytes; import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer; import org.apache.lucene.util.packed.PackedInts; @@ -49,6 +51,7 @@ class BinaryDocValuesWriter extends DocValuesWriter { private final Counter iwBytesUsed; private final AppendingDeltaPackedLongBuffer lengths; + private final OpenBitSet docsWithField; private final FieldInfo fieldInfo; private int addedValues; private long bytesUsed; @@ -59,6 +62,9 @@ class BinaryDocValuesWriter extends DocValuesWriter { this.bytesOut = bytes.getDataOutput(); this.lengths = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT); this.iwBytesUsed = iwBytesUsed; + this.docsWithField = new OpenBitSet(); + this.bytesUsed = docsWithFieldBytesUsed(); + iwBytesUsed.addAndGet(bytesUsed); } public void addValue(int docID, BytesRef value) { @@ -85,11 +91,17 @@ class BinaryDocValuesWriter extends DocValuesWriter { // Should never happen! throw new RuntimeException(ioe); } + docsWithField.set(docID); updateBytesUsed(); } + + private long docsWithFieldBytesUsed() { + // size of the long[] + some overhead + return RamUsageEstimator.sizeOf(docsWithField.getBits()) + 64; + } private void updateBytesUsed() { - final long newBytesUsed = lengths.ramBytesUsed() + bytes.ramBytesUsed(); + final long newBytesUsed = lengths.ramBytesUsed() + bytes.ramBytesUsed() + docsWithFieldBytesUsed(); iwBytesUsed.addAndGet(newBytesUsed - bytesUsed); bytesUsed = newBytesUsed; } @@ -138,6 +150,7 @@ class BinaryDocValuesWriter extends DocValuesWriter { if (!hasNext()) { throw new NoSuchElementException(); } + final BytesRef v; if (upto < size) { int length = (int) lengthsIterator.next(); value.grow(length); @@ -148,13 +161,16 @@ class BinaryDocValuesWriter extends DocValuesWriter { // Should never happen! 
throw new RuntimeException(ioe); } + if (docsWithField.get(upto)) { + v = value; + } else { + v = null; + } } else { - // This is to handle last N documents not having - // this DV field in the end of the segment: - value.length = 0; + v = null; } upto++; - return value; + return v; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 27a1aface60..d0f7bed509e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1280,7 +1280,8 @@ public class CheckIndex { if (reader.getBinaryDocValues(fieldInfo.name) != null || reader.getNumericDocValues(fieldInfo.name) != null || reader.getSortedDocValues(fieldInfo.name) != null || - reader.getSortedSetDocValues(fieldInfo.name) != null) { + reader.getSortedSetDocValues(fieldInfo.name) != null || + reader.getDocsWithField(fieldInfo.name) != null) { throw new RuntimeException("field: " + fieldInfo.name + " has docvalues but should omit them!"); } } @@ -1301,26 +1302,37 @@ public class CheckIndex { return status; } - private static void checkBinaryDocValues(String fieldName, AtomicReader reader, BinaryDocValues dv) { + private static void checkBinaryDocValues(String fieldName, AtomicReader reader, BinaryDocValues dv, Bits docsWithField) { BytesRef scratch = new BytesRef(); for (int i = 0; i < reader.maxDoc(); i++) { dv.get(i, scratch); assert scratch.isValid(); + if (docsWithField.get(i) == false && scratch.length > 0) { + throw new RuntimeException("dv for field: " + fieldName + " is missing but has value=" + scratch + " for doc: " + i); + } } } - private static void checkSortedDocValues(String fieldName, AtomicReader reader, SortedDocValues dv) { - checkBinaryDocValues(fieldName, reader, dv); + private static void checkSortedDocValues(String fieldName, AtomicReader reader, SortedDocValues dv, Bits docsWithField) { + checkBinaryDocValues(fieldName, reader, dv, docsWithField); final int maxOrd = dv.getValueCount()-1; FixedBitSet seenOrds = new FixedBitSet(dv.getValueCount()); int maxOrd2 = -1; for (int i = 0; i < reader.maxDoc(); i++) { int ord = dv.getOrd(i); - if (ord < 0 || ord > maxOrd) { + if (ord == -1) { + if (docsWithField.get(i)) { + throw new RuntimeException("dv for field: " + fieldName + " has -1 ord but is not marked missing for doc: " + i); + } + } else if (ord < -1 || ord > maxOrd) { throw new RuntimeException("ord out of bounds: " + ord); + } else { + if (!docsWithField.get(i)) { + throw new RuntimeException("dv for field: " + fieldName + " is missing but has ord=" + ord + " for doc: " + i); + } + maxOrd2 = Math.max(maxOrd2, ord); + seenOrds.set(ord); } - maxOrd2 = Math.max(maxOrd2, ord); - seenOrds.set(ord); } if (maxOrd != maxOrd2) { throw new RuntimeException("dv for field: " + fieldName + " reports wrong maxOrd=" + maxOrd + " but this is not the case: " + maxOrd2); @@ -1342,7 +1354,7 @@ public class CheckIndex { } } - private static void checkSortedSetDocValues(String fieldName, AtomicReader reader, SortedSetDocValues dv) { + private static void checkSortedSetDocValues(String fieldName, AtomicReader reader, SortedSetDocValues dv, Bits docsWithField) { final long maxOrd = dv.getValueCount()-1; OpenBitSet seenOrds = new OpenBitSet(dv.getValueCount()); long maxOrd2 = -1; @@ -1350,16 +1362,28 @@ public class CheckIndex { dv.setDocument(i); long lastOrd = -1; long ord; - while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { - if (ord <= 
lastOrd) { - throw new RuntimeException("ords out of order: " + ord + " <= " + lastOrd + " for doc: " + i); + if (docsWithField.get(i)) { + int ordCount = 0; + while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + ordCount++; + if (ord <= lastOrd) { + throw new RuntimeException("ords out of order: " + ord + " <= " + lastOrd + " for doc: " + i); + } + if (ord < 0 || ord > maxOrd) { + throw new RuntimeException("ord out of bounds: " + ord); + } + lastOrd = ord; + maxOrd2 = Math.max(maxOrd2, ord); + seenOrds.set(ord); } - if (ord < 0 || ord > maxOrd) { - throw new RuntimeException("ord out of bounds: " + ord); + if (ordCount == 0) { + throw new RuntimeException("dv for field: " + fieldName + " has no ordinals but is not marked missing for doc: " + i); + } + } else { + long o = dv.nextOrd(); + if (o != SortedSetDocValues.NO_MORE_ORDS) { + throw new RuntimeException("dv for field: " + fieldName + " is marked missing but has ord=" + o + " for doc: " + i); } - lastOrd = ord; - maxOrd2 = Math.max(maxOrd2, ord); - seenOrds.set(ord); } } if (maxOrd != maxOrd2) { @@ -1383,17 +1407,26 @@ public class CheckIndex { } } - private static void checkNumericDocValues(String fieldName, AtomicReader reader, NumericDocValues ndv) { + private static void checkNumericDocValues(String fieldName, AtomicReader reader, NumericDocValues ndv, Bits docsWithField) { for (int i = 0; i < reader.maxDoc(); i++) { - ndv.get(i); + long value = ndv.get(i); + if (docsWithField.get(i) == false && value != 0) { + throw new RuntimeException("dv for field: " + fieldName + " is marked missing but has value=" + value + " for doc: " + i); + } } } private static void checkDocValues(FieldInfo fi, AtomicReader reader, PrintStream infoStream, DocValuesStatus status) throws Exception { + Bits docsWithField = reader.getDocsWithField(fi.name); + if (docsWithField == null) { + throw new RuntimeException(fi.name + " docsWithField does not exist"); + } else if (docsWithField.length() != reader.maxDoc()) { + throw new RuntimeException(fi.name + " docsWithField has incorrect length: " + docsWithField.length() + ",expected: " + reader.maxDoc()); + } switch(fi.getDocValuesType()) { case SORTED: status.totalSortedFields++; - checkSortedDocValues(fi.name, reader, reader.getSortedDocValues(fi.name)); + checkSortedDocValues(fi.name, reader, reader.getSortedDocValues(fi.name), docsWithField); if (reader.getBinaryDocValues(fi.name) != null || reader.getNumericDocValues(fi.name) != null || reader.getSortedSetDocValues(fi.name) != null) { @@ -1402,7 +1435,7 @@ public class CheckIndex { break; case SORTED_SET: status.totalSortedSetFields++; - checkSortedSetDocValues(fi.name, reader, reader.getSortedSetDocValues(fi.name)); + checkSortedSetDocValues(fi.name, reader, reader.getSortedSetDocValues(fi.name), docsWithField); if (reader.getBinaryDocValues(fi.name) != null || reader.getNumericDocValues(fi.name) != null || reader.getSortedDocValues(fi.name) != null) { @@ -1411,7 +1444,7 @@ public class CheckIndex { break; case BINARY: status.totalBinaryFields++; - checkBinaryDocValues(fi.name, reader, reader.getBinaryDocValues(fi.name)); + checkBinaryDocValues(fi.name, reader, reader.getBinaryDocValues(fi.name), docsWithField); if (reader.getNumericDocValues(fi.name) != null || reader.getSortedDocValues(fi.name) != null || reader.getSortedSetDocValues(fi.name) != null) { @@ -1420,7 +1453,7 @@ public class CheckIndex { break; case NUMERIC: status.totalNumericFields++; - checkNumericDocValues(fi.name, reader, reader.getNumericDocValues(fi.name)); + 
checkNumericDocValues(fi.name, reader, reader.getNumericDocValues(fi.name), docsWithField); if (reader.getBinaryDocValues(fi.name) != null || reader.getSortedDocValues(fi.name) != null || reader.getSortedSetDocValues(fi.name) != null) { @@ -1435,7 +1468,7 @@ public class CheckIndex { private static void checkNorms(FieldInfo fi, AtomicReader reader, PrintStream infoStream) throws IOException { switch(fi.getNormType()) { case NUMERIC: - checkNumericDocValues(fi.name, reader, reader.getNormValues(fi.name)); + checkNumericDocValues(fi.name, reader, reader.getNormValues(fi.name), new Bits.MatchAllBits(reader.maxDoc())); break; default: throw new AssertionError("wtf: " + fi.getNormType()); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java index 90f2e4514f6..cb1b30154dd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java @@ -143,7 +143,7 @@ final class DocValuesProcessor extends StoredFieldsConsumer { DocValuesWriter writer = writers.get(fieldInfo.name); NumericDocValuesWriter numericWriter; if (writer == null) { - numericWriter = new NumericDocValuesWriter(fieldInfo, bytesUsed); + numericWriter = new NumericDocValuesWriter(fieldInfo, bytesUsed, true); writers.put(fieldInfo.name, numericWriter); } else if (!(writer instanceof NumericDocValuesWriter)) { throw new IllegalArgumentException("Incompatible DocValues type: field \"" + fieldInfo.name + "\" changed from " + getTypeDesc(writer) + " to numeric"); diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java index 93be7a66105..4a8a55a3433 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java @@ -414,4 +414,10 @@ public class FilterAtomicReader extends AtomicReader { return in.getNormValues(field); } + @Override + public Bits getDocsWithField(String field) throws IOException { + ensureOpen(); + return in.getDocsWithField(field); + } + } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java index 8f262a8bb5e..1e6671ea05c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java @@ -22,6 +22,7 @@ import java.util.List; import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex; import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.packed.AppendingPackedLongBuffer; import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer; @@ -135,6 +136,51 @@ public class MultiDocValues { }; } } + + /** Returns a Bits for a reader's docsWithField (potentially merging on-the-fly) + *

    + * This is a slow way to access this bitset. Instead, access them per-segment + * with {@link AtomicReader#getDocsWithField(String)} + *

    + * */ + public static Bits getDocsWithField(final IndexReader r, final String field) throws IOException { + final List leaves = r.leaves(); + final int size = leaves.size(); + if (size == 0) { + return null; + } else if (size == 1) { + return leaves.get(0).reader().getDocsWithField(field); + } + + boolean anyReal = false; + boolean anyMissing = false; + final Bits[] values = new Bits[size]; + final int[] starts = new int[size+1]; + for (int i = 0; i < size; i++) { + AtomicReaderContext context = leaves.get(i); + Bits v = context.reader().getDocsWithField(field); + if (v == null) { + v = new Bits.MatchNoBits(context.reader().maxDoc()); + anyMissing = true; + } else { + anyReal = true; + if (v instanceof Bits.MatchAllBits == false) { + anyMissing = true; + } + } + values[i] = v; + starts[i] = context.docBase; + } + starts[size] = r.maxDoc(); + + if (!anyReal) { + return null; + } else if (!anyMissing) { + return new Bits.MatchAllBits(r.maxDoc()); + } else { + return new MultiBits(values, starts, false); + } + } /** Returns a BinaryDocValues for a reader's docvalues (potentially merging on-the-fly) *
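    A usage sketch for the two new docsWithField entry points in this patch:
per-segment access via AtomicReader.getDocsWithField (preferred) and the merged
composite view via MultiDocValues.getDocsWithField (slower, as the javadoc above
notes). The Directory argument and the field name "price" are assumptions:

        import java.io.IOException;
        import org.apache.lucene.index.AtomicReaderContext;
        import org.apache.lucene.index.DirectoryReader;
        import org.apache.lucene.index.MultiDocValues;
        import org.apache.lucene.store.Directory;
        import org.apache.lucene.util.Bits;

        class DocsWithFieldExample {
          static void show(Directory dir) throws IOException {
            DirectoryReader r = DirectoryReader.open(dir);
            for (AtomicReaderContext ctx : r.leaves()) {
              Bits withField = ctx.reader().getDocsWithField("price");
              // null means no DocValues were indexed for "price" in this segment
            }
            Bits merged = MultiDocValues.getDocsWithField(r, "price"); // slow merged view
            r.close();
          }
        }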

    diff --git a/lucene/core/src/java/org/apache/lucene/index/NormsConsumerPerField.java b/lucene/core/src/java/org/apache/lucene/index/NormsConsumerPerField.java index 4a3219eaa23..724c9ed05e0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NormsConsumerPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/NormsConsumerPerField.java @@ -44,7 +44,7 @@ final class NormsConsumerPerField extends InvertedDocEndConsumerPerField impleme if (fieldInfo.isIndexed() && !fieldInfo.omitsNorms()) { if (consumer == null) { fieldInfo.setNormValueType(FieldInfo.DocValuesType.NUMERIC); - consumer = new NumericDocValuesWriter(fieldInfo, docState.docWriter.bytesUsed); + consumer = new NumericDocValuesWriter(fieldInfo, docState.docWriter.bytesUsed, false); } consumer.addValue(docState.docID, similarity.computeNorm(fieldState)); } diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java index cc070830927..08f065e1df2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java @@ -23,6 +23,8 @@ import java.util.NoSuchElementException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.util.Counter; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer; import org.apache.lucene.util.packed.PackedInts; @@ -35,14 +37,18 @@ class NumericDocValuesWriter extends DocValuesWriter { private AppendingDeltaPackedLongBuffer pending; private final Counter iwBytesUsed; private long bytesUsed; + private final OpenBitSet docsWithField; private final FieldInfo fieldInfo; + private final boolean trackDocsWithField; - public NumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { + public NumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed, boolean trackDocsWithField) { pending = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT); - bytesUsed = pending.ramBytesUsed(); + docsWithField = new OpenBitSet(); + bytesUsed = pending.ramBytesUsed() + docsWithFieldBytesUsed(); this.fieldInfo = fieldInfo; this.iwBytesUsed = iwBytesUsed; iwBytesUsed.addAndGet(bytesUsed); + this.trackDocsWithField = trackDocsWithField; } public void addValue(int docID, long value) { @@ -56,12 +62,20 @@ class NumericDocValuesWriter extends DocValuesWriter { } pending.add(value); + if (trackDocsWithField) { + docsWithField.set(docID); + } updateBytesUsed(); } + + private long docsWithFieldBytesUsed() { + // size of the long[] + some overhead + return RamUsageEstimator.sizeOf(docsWithField.getBits()) + 64; + } private void updateBytesUsed() { - final long newBytesUsed = pending.ramBytesUsed(); + final long newBytesUsed = pending.ramBytesUsed() + docsWithFieldBytesUsed(); iwBytesUsed.addAndGet(newBytesUsed - bytesUsed); bytesUsed = newBytesUsed; } @@ -109,14 +123,18 @@ class NumericDocValuesWriter extends DocValuesWriter { if (!hasNext()) { throw new NoSuchElementException(); } - long value; + Long value; if (upto < size) { - value = iter.next(); + long v = iter.next(); + if (!trackDocsWithField || docsWithField.get(upto)) { + value = v; + } else { + value = null; + } } else { - value = 0; + value = trackDocsWithField ? 
null : MISSING; } upto++; - // TODO: make reusable Number return value; } diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java index c7174d8e7b5..cbc4bbd7f3e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java @@ -285,6 +285,13 @@ public class ParallelAtomicReader extends AtomicReader { return reader == null ? null : reader.getSortedSetDocValues(field); } + @Override + public Bits getDocsWithField(String field) throws IOException { + ensureOpen(); + AtomicReader reader = fieldToReader.get(field); + return reader == null ? null : reader.getDocsWithField(field); + } + @Override public NumericDocValues getNormValues(String field) throws IOException { ensureOpen(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java index ab0348293a9..3a526aad54f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java @@ -36,6 +36,7 @@ import org.apache.lucene.index.SegmentReader.CoreClosedListener; import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.util.IOUtils; @@ -87,6 +88,13 @@ final class SegmentCoreReaders { return new HashMap(); } }; + + final CloseableThreadLocal> docsWithFieldLocal = new CloseableThreadLocal>() { + @Override + protected Map initialValue() { + return new HashMap(); + } + }; final CloseableThreadLocal> normsLocal = new CloseableThreadLocal>() { @Override @@ -274,6 +282,30 @@ final class SegmentCoreReaders { return dvs; } + + Bits getDocsWithField(String field) throws IOException { + FieldInfo fi = fieldInfos.fieldInfo(field); + if (fi == null) { + // Field does not exist + return null; + } + if (fi.getDocValuesType() == null) { + // Field was not indexed with doc values + return null; + } + + assert dvProducer != null; + + Map dvFields = docsWithFieldLocal.get(); + + Bits dvs = dvFields.get(field); + if (dvs == null) { + dvs = dvProducer.getDocsWithField(fi); + dvFields.put(field, dvs); + } + + return dvs; + } NumericDocValues getNormValues(String field) throws IOException { FieldInfo fi = fieldInfos.fieldInfo(field); @@ -300,8 +332,8 @@ final class SegmentCoreReaders { void decRef() throws IOException { if (ref.decrementAndGet() == 0) { - IOUtils.close(termVectorsLocal, fieldsReaderLocal, docValuesLocal, normsLocal, fields, dvProducer, - termVectorsReaderOrig, fieldsReaderOrig, cfsReader, normsProducer); + IOUtils.close(termVectorsLocal, fieldsReaderLocal, docValuesLocal, normsLocal, docsWithFieldLocal, fields, + dvProducer, termVectorsReaderOrig, fieldsReaderOrig, cfsReader, normsProducer); notifyCoreClosedListeners(); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java index f121e85b10f..718687bcc85 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java @@ -30,6 +30,7 @@ import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.index.FieldInfo.DocValuesType; import 
org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; @@ -156,24 +157,32 @@ final class SegmentMerger { if (type != null) { if (type == DocValuesType.NUMERIC) { List toMerge = new ArrayList(); + List docsWithField = new ArrayList(); for (AtomicReader reader : mergeState.readers) { NumericDocValues values = reader.getNumericDocValues(field.name); + Bits bits = reader.getDocsWithField(field.name); if (values == null) { values = NumericDocValues.EMPTY; + bits = new Bits.MatchNoBits(reader.maxDoc()); } toMerge.add(values); + docsWithField.add(bits); } - consumer.mergeNumericField(field, mergeState, toMerge); + consumer.mergeNumericField(field, mergeState, toMerge, docsWithField); } else if (type == DocValuesType.BINARY) { List toMerge = new ArrayList(); + List docsWithField = new ArrayList(); for (AtomicReader reader : mergeState.readers) { BinaryDocValues values = reader.getBinaryDocValues(field.name); + Bits bits = reader.getDocsWithField(field.name); if (values == null) { values = BinaryDocValues.EMPTY; + bits = new Bits.MatchNoBits(reader.maxDoc()); } toMerge.add(values); + docsWithField.add(bits); } - consumer.mergeBinaryField(field, mergeState, toMerge); + consumer.mergeBinaryField(field, mergeState, toMerge, docsWithField); } else if (type == DocValuesType.SORTED) { List toMerge = new ArrayList(); for (AtomicReader reader : mergeState.readers) { @@ -216,14 +225,16 @@ final class SegmentMerger { for (FieldInfo field : mergeState.fieldInfos) { if (field.hasNorms()) { List toMerge = new ArrayList(); + List docsWithField = new ArrayList(); for (AtomicReader reader : mergeState.readers) { NumericDocValues norms = reader.getNormValues(field.name); if (norms == null) { norms = NumericDocValues.EMPTY; } toMerge.add(norms); + docsWithField.add(new Bits.MatchAllBits(reader.maxDoc())); } - consumer.mergeNumericField(field, mergeState, toMerge); + consumer.mergeNumericField(field, mergeState, toMerge, docsWithField); } } success = true; diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java index 8214a980cd9..c6cf702955b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java @@ -223,6 +223,12 @@ public final class SegmentReader extends AtomicReader { return core.getNumericDocValues(field); } + @Override + public Bits getDocsWithField(String field) throws IOException { + ensureOpen(); + return core.getDocsWithField(field); + } + @Override public BinaryDocValues getBinaryDocValues(String field) throws IOException { ensureOpen(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java index bce0ef64257..b7af7d0ba88 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java @@ -91,6 +91,12 @@ public final class SlowCompositeReaderWrapper extends AtomicReader { return MultiDocValues.getNumericValues(in, field); } + @Override + public Bits getDocsWithField(String field) throws IOException { + ensureOpen(); + return MultiDocValues.getDocsWithField(in, field); + } + @Override public BinaryDocValues getBinaryDocValues(String field) throws IOException { ensureOpen(); 
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java index c7dae5b3dd9..df36931a253 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java @@ -37,12 +37,13 @@ public abstract class SortedDocValues extends BinaryDocValues { * Returns the ordinal for the specified docID. * @param docID document ID to lookup * @return ordinal for the document: this is dense, starts at 0, then - * increments by 1 for the next value in sorted order. + * increments by 1 for the next value in sorted order. Note that + * missing values are indicated by -1. */ public abstract int getOrd(int docID); /** Retrieves the value for the specified ordinal. - * @param ord ordinal to lookup + * @param ord ordinal to lookup (must be >= 0 and < {@link #getValueCount()}) * @param result will be populated with the ordinal's value * @see #getOrd(int) */ @@ -71,7 +72,7 @@ public abstract class SortedDocValues extends BinaryDocValues { public static final SortedDocValues EMPTY = new SortedDocValues() { @Override public int getOrd(int docID) { - return 0; + return -1; } @Override @@ -83,7 +84,7 @@ public abstract class SortedDocValues extends BinaryDocValues { @Override public int getValueCount() { - return 1; + return 0; } }; diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java index d337a0ca1c5..4d42a2e15c9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java @@ -30,19 +30,19 @@ import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.Counter; import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.packed.AppendingPackedLongBuffer; +import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer; import org.apache.lucene.util.packed.PackedInts; /** Buffers up pending byte[] per doc, deref and sorting via * int ord, then flushes when segment flushes. 
*/ class SortedDocValuesWriter extends DocValuesWriter { final BytesRefHash hash; - private AppendingPackedLongBuffer pending; + private AppendingDeltaPackedLongBuffer pending; private final Counter iwBytesUsed; private long bytesUsed; // this currently only tracks differences in 'pending' private final FieldInfo fieldInfo; - private static final BytesRef EMPTY = new BytesRef(BytesRef.EMPTY_BYTES); + private static final int EMPTY_ORD = -1; public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { this.fieldInfo = fieldInfo; @@ -52,7 +52,7 @@ class SortedDocValuesWriter extends DocValuesWriter { new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)), BytesRefHash.DEFAULT_CAPACITY, new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed)); - pending = new AppendingPackedLongBuffer(PackedInts.COMPACT); + pending = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT); bytesUsed = pending.ramBytesUsed(); iwBytesUsed.addAndGet(bytesUsed); } @@ -70,7 +70,7 @@ class SortedDocValuesWriter extends DocValuesWriter { // Fill in any holes: while(pending.size() < docID) { - addOneValue(EMPTY); + pending.add(EMPTY_ORD); } addOneValue(value); @@ -79,8 +79,9 @@ class SortedDocValuesWriter extends DocValuesWriter { @Override public void finish(int maxDoc) { while(pending.size() < maxDoc) { - addOneValue(EMPTY); + pending.add(EMPTY_ORD); } + updateBytesUsed(); } private void addOneValue(BytesRef value) { @@ -177,7 +178,7 @@ class SortedDocValuesWriter extends DocValuesWriter { // iterates over the ords for each doc we have in ram private class OrdsIterator implements Iterator { - final AppendingPackedLongBuffer.Iterator iter = pending.iterator(); + final AppendingDeltaPackedLongBuffer.Iterator iter = pending.iterator(); final int ordMap[]; final int maxDoc; int docUpto; @@ -200,8 +201,7 @@ class SortedDocValuesWriter extends DocValuesWriter { } int ord = (int) iter.next(); docUpto++; - // TODO: make reusable Number - return ordMap[ord]; + return ord == -1 ? ord : ordMap[ord]; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldCache.java b/lucene/core/src/java/org/apache/lucene/search/FieldCache.java index f87fb51acf3..cce1024ed5d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FieldCache.java +++ b/lucene/core/src/java/org/apache/lucene/search/FieldCache.java @@ -104,26 +104,6 @@ public interface FieldCache { } }; } - - /** Returns MISSING/-1 ordinal for every document */ - public static final SortedDocValues EMPTY_TERMSINDEX = new SortedDocValues() { - @Override - public int getOrd(int docID) { - return -1; - } - - @Override - public void lookupOrd(int ord, BytesRef result) { - result.bytes = MISSING; - result.offset = 0; - result.length = 0; - } - - @Override - public int getValueCount() { - return 0; - } - }; /** * Placeholder indicating creation of this cache is currently in-progress. @@ -266,13 +246,10 @@ public interface FieldCache { } }; - /** Checks the internal cache for an appropriate entry, and if none is found, * reads the terms in field and returns a bit set at the size of * reader.maxDoc(), with turned on bits for each docid that - * does have a value for this field. Note that if the field was only indexed - * as DocValues then this method will not work (it will return a Bits stating - * that no documents contain the field). + * does have a value for this field. 
*/ public Bits getDocsWithField(AtomicReader reader, String field) throws IOException; diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java b/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java index 2dc5cb6bf9a..b8e81d17ea4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java +++ b/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java @@ -501,8 +501,7 @@ class FieldCacheImpl implements FieldCache { // field does not exist or has no value return new Bits.MatchNoBits(reader.maxDoc()); } else if (fieldInfo.hasDocValues()) { - // doc values are dense - return new Bits.MatchAllBits(reader.maxDoc()); + return reader.getDocsWithField(field); } else if (!fieldInfo.isIndexed()) { return new Bits.MatchNoBits(reader.maxDoc()); } @@ -944,13 +943,13 @@ class FieldCacheImpl implements FieldCache { } else { final FieldInfo info = reader.getFieldInfos().fieldInfo(field); if (info == null) { - return EMPTY_TERMSINDEX; + return SortedDocValues.EMPTY; } else if (info.hasDocValues()) { // we don't try to build a sorted instance from numeric/binary doc // values because dedup can be very costly throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); } else if (!info.isIndexed()) { - return EMPTY_TERMSINDEX; + return SortedDocValues.EMPTY; } return (SortedDocValues) caches.get(SortedDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio), false); } diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 8ae12c55368..3398b343d49 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -16,3 +16,4 @@ org.apache.lucene.codecs.lucene40.Lucene40Codec org.apache.lucene.codecs.lucene41.Lucene41Codec org.apache.lucene.codecs.lucene42.Lucene42Codec +org.apache.lucene.codecs.lucene45.Lucene45Codec \ No newline at end of file diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat index c0e599e3611..262f8a2bdd5 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat @@ -14,3 +14,4 @@ # limitations under the License. org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat +org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat \ No newline at end of file diff --git a/lucene/core/src/test/org/apache/lucene/TestExternalCodecs.java b/lucene/core/src/test/org/apache/lucene/TestExternalCodecs.java index d973fb5c67c..df66d801833 100644 --- a/lucene/core/src/test/org/apache/lucene/TestExternalCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/TestExternalCodecs.java @@ -17,21 +17,27 @@ package org.apache.lucene; * limitations under the License. 
*/ -import org.apache.lucene.analysis.*; -import org.apache.lucene.codecs.*; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; -import org.apache.lucene.document.*; -import org.apache.lucene.index.*; -import org.apache.lucene.search.*; -import org.apache.lucene.store.*; -import org.apache.lucene.util.*; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.BaseDirectoryWrapper; +import org.apache.lucene.util.LuceneTestCase; + /* Intentionally outside of oal.index to verify fully external codecs work fine */ public class TestExternalCodecs extends LuceneTestCase { - private static final class CustomPerFieldCodec extends Lucene42Codec { + private static final class CustomPerFieldCodec extends Lucene45Codec { private final PostingsFormat ramFormat = PostingsFormat.forName("RAMOnly"); private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41"); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java index d86002eb0af..e8885927644 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java @@ -24,7 +24,7 @@ import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase; * Tests Lucene42DocValuesFormat */ public class TestLucene42DocValuesFormat extends BaseCompressingDocValuesFormatTestCase { - private final Codec codec = new Lucene42Codec(); + private final Codec codec = new Lucene42RWCodec(); @Override protected Codec getCodec() { diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestCheapBastardDocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene45/TestLucene45DocValuesFormat.java similarity index 79% rename from lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestCheapBastardDocValuesFormat.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene45/TestLucene45DocValuesFormat.java index 4cd40c5cd4e..3f6171acd62 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestCheapBastardDocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene45/TestLucene45DocValuesFormat.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.diskdv; +package org.apache.lucene.codecs.lucene45; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -18,15 +18,14 @@ package org.apache.lucene.codecs.diskdv; */ import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.cheapbastard.CheapBastardDocValuesFormat; import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase; import org.apache.lucene.util._TestUtil; /** - * Tests CheapBastardDocValuesFormat + * Tests Lucene45DocValuesFormat */ -public class TestCheapBastardDocValuesFormat extends BaseCompressingDocValuesFormatTestCase { - private final Codec codec = _TestUtil.alwaysDocValuesFormat(new CheapBastardDocValuesFormat()); +public class 
TestLucene45DocValuesFormat extends BaseCompressingDocValuesFormatTestCase { + private final Codec codec = _TestUtil.alwaysDocValuesFormat(new Lucene45DocValuesFormat()); @Override protected Codec getCodec() { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java index 06795f005c6..43ac931c16f 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java @@ -25,7 +25,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -79,9 +79,9 @@ public class TestPerFieldDocValuesFormat extends BaseDocValuesFormatTestCase { Directory directory = newDirectory(); // we don't use RandomIndexWriter because it might add more docvalues than we expect !!!!1 IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); - final DocValuesFormat fast = DocValuesFormat.forName("Lucene42"); + final DocValuesFormat fast = DocValuesFormat.forName("Lucene45"); final DocValuesFormat slow = DocValuesFormat.forName("SimpleText"); - iwc.setCodec(new Lucene42Codec() { + iwc.setCodec(new Lucene45Codec() { @Override public DocValuesFormat getDocValuesFormatForField(String field) { if ("dv1".equals(field)) { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java index 8d9f8838fba..4bfc8115ef6 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java @@ -21,7 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; import org.apache.lucene.codecs.mocksep.MockSepPostingsFormat; import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat; @@ -200,7 +200,7 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase { } - public static class MockCodec extends Lucene42Codec { + public static class MockCodec extends Lucene45Codec { final PostingsFormat lucene40 = new Lucene41PostingsFormat(); final PostingsFormat simpleText = new SimpleTextPostingsFormat(); final PostingsFormat mockSep = new MockSepPostingsFormat(); @@ -217,7 +217,7 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase { } } - public static class MockCodec2 extends Lucene42Codec { + public static class MockCodec2 extends Lucene45Codec { final PostingsFormat lucene40 = new Lucene41PostingsFormat(); final PostingsFormat simpleText = new SimpleTextPostingsFormat(); @@ -268,7 +268,7 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase { } public void testSameCodecDifferentInstance() throws Exception { - Codec 
codec = new Lucene42Codec() { + Codec codec = new Lucene45Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { if ("id".equals(field)) { @@ -284,7 +284,7 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase { } public void testSameCodecDifferentParams() throws Exception { - Codec codec = new Lucene42Codec() { + Codec codec = new Lucene45Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { if ("id".equals(field)) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index d0789dca5e3..d0ea942b959 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -28,7 +28,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -1060,7 +1060,7 @@ public class TestAddIndexes extends LuceneTestCase { aux2.close(); } - private static final class CustomPerFieldCodec extends Lucene42Codec { + private static final class CustomPerFieldCodec extends Lucene45Codec { private final PostingsFormat simpleTextFormat = PostingsFormat.forName("SimpleText"); private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41"); private final PostingsFormat mockSepFormat = PostingsFormat.forName("MockSep"); @@ -1111,7 +1111,7 @@ public class TestAddIndexes extends LuceneTestCase { private static final class UnRegisteredCodec extends FilterCodec { public UnRegisteredCodec() { - super("NotRegistered", new Lucene42Codec()); + super("NotRegistered", new Lucene45Codec()); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java index 9a9f75e47d6..784301783e1 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java @@ -21,7 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexFileNames; @@ -41,7 +41,7 @@ public class TestAllFilesHaveCodecHeader extends LuceneTestCase { public void test() throws Exception { Directory dir = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); - conf.setCodec(new Lucene42Codec()); + conf.setCodec(new Lucene45Codec()); // riw should sometimes create docvalues fields, etc RandomIndexWriter riw = new RandomIndexWriter(random(), dir, conf); Document doc = new Document(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index ca851727ecf..8d9529a35bc 100644 --- 
a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -74,7 +74,7 @@ import org.junit.Ignore; // we won't even be running the actual code, only the impostor // @SuppressCodecs("Lucene4x") // Sep codec cannot yet handle the offsets in our 4.x index! -@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom", "Lucene40", "Lucene41"}) +@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom", "Lucene40", "Lucene41", "Lucene42"}) public class TestBackwardsCompatibility extends LuceneTestCase { // Uncomment these cases & run them on an older Lucene version, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java index 7c4f1a68a65..25fef8ca367 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; +import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; @@ -49,7 +50,7 @@ public class TestDuelingCodecs extends LuceneTestCase { public void setUp() throws Exception { super.setUp(); - // for now its SimpleText vs Lucene42(random postings format) + // for now its SimpleText vs Lucene45(random postings format) // as this gives the best overall coverage. when we have more // codecs we should probably pick 2 from Codec.availableCodecs() @@ -134,6 +135,11 @@ public class TestDuelingCodecs extends LuceneTestCase { for (String trash : split) { document.add(new SortedSetDocValuesField("sortedset", new BytesRef(trash))); } + // add a numeric dv field sometimes + document.removeFields("sparsenumeric"); + if (random.nextInt(4) == 2) { + document.add(new NumericDocValuesField("sparsenumeric", random.nextInt())); + } writer.addDocument(document); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java b/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java index 78c8974c555..8944dd66b67 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java @@ -26,6 +26,7 @@ import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -321,4 +322,52 @@ public class TestMultiDocValues extends LuceneTestCase { ir2.close(); dir.close(); } + + public void testDocsWithField() throws Exception { + assumeTrue("codec does not support docsWithField", defaultCodecSupportsDocsWithField()); + Directory dir = newDirectory(); + + IndexWriterConfig iwc = newIndexWriterConfig(random(), TEST_VERSION_CURRENT, null); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + int numDocs = atLeast(500); + for (int i = 0; i < numDocs; i++) { + Document 
doc = new Document(); + if (random().nextInt(4) > 0) { + doc.add(new NumericDocValuesField("numbers", random().nextLong())); + } + doc.add(new NumericDocValuesField("numbersAlways", random().nextLong())); + iw.addDocument(doc); + if (random().nextInt(17) == 0) { + iw.commit(); + } + } + DirectoryReader ir = iw.getReader(); + iw.forceMerge(1); + DirectoryReader ir2 = iw.getReader(); + AtomicReader merged = getOnlySegmentReader(ir2); + iw.close(); + + Bits multi = MultiDocValues.getDocsWithField(ir, "numbers"); + Bits single = merged.getDocsWithField("numbers"); + if (multi == null) { + assertNull(single); + } else { + assertEquals(single.length(), multi.length()); + for (int i = 0; i < numDocs; i++) { + assertEquals(single.get(i), multi.get(i)); + } + } + + multi = MultiDocValues.getDocsWithField(ir, "numbersAlways"); + single = merged.getDocsWithField("numbersAlways"); + assertEquals(single.length(), multi.length()); + for (int i = 0; i < numDocs; i++) { + assertEquals(single.get(i), multi.get(i)); + } + ir.close(); + ir2.close(); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java b/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java index 2c498ae7211..c36367277bb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java @@ -480,7 +480,7 @@ public class TestFieldCache extends LuceneTestCase { } catch (IllegalStateException expected) {} Bits bits = FieldCache.DEFAULT.getDocsWithField(ar, "binary"); - assertTrue(bits instanceof Bits.MatchAllBits); + assertTrue(bits.get(0)); // Sorted type: can be retrieved via getTerms(), getTermsIndex(), getDocTermOrds() try { @@ -510,7 +510,7 @@ assertEquals(1, sortedSet.getValueCount()); bits = FieldCache.DEFAULT.getDocsWithField(ar, "sorted"); - assertTrue(bits instanceof Bits.MatchAllBits); + assertTrue(bits.get(0)); // Numeric type: can be retrieved via getInts() and so on Ints numeric = FieldCache.DEFAULT.getInts(ar, "numeric", false); @@ -537,7 +537,7 @@ } catch (IllegalStateException expected) {} bits = FieldCache.DEFAULT.getDocsWithField(ar, "numeric"); - assertTrue(bits instanceof Bits.MatchAllBits); + assertTrue(bits.get(0)); // SortedSet type: can be retrieved via getDocTermOrds() if (defaultCodecSupportsSortedSet()) { @@ -569,7 +569,7 @@ assertEquals(2, sortedSet.getValueCount()); bits = FieldCache.DEFAULT.getDocsWithField(ar, "sortedset"); - assertTrue(bits instanceof Bits.MatchAllBits); + assertTrue(bits.get(0)); } ir.close(); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java b/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java index 49f954d319f..7a24a11845f 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java @@ -31,10 +31,12 @@ import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; /** Tests basic sorting on docvalues fields.
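The two tests above are the end-to-end check for the new missing-value support: a document may simply omit a DocValues field, getDocsWithField reports which documents actually carry a value, and the FieldCache assertions switch from an instanceof Bits.MatchAllBits check to bits.get(0) because a fully populated field is no longer guaranteed to be backed by that particular implementation. A minimal consumer-side sketch, assuming a DirectoryReader over an index with a sparse "numbers" field as in testDocsWithField:

    import java.io.IOException;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.MultiDocValues;
    import org.apache.lucene.index.NumericDocValues;
    import org.apache.lucene.util.Bits;

    class DocsWithFieldSketch {
      // Prints each document's "numbers" value, or "(missing)" when the
      // document was indexed without that DocValues field.
      static void dump(DirectoryReader reader) throws IOException {
        NumericDocValues values = MultiDocValues.getNumericValues(reader, "numbers");
        Bits docsWithField = MultiDocValues.getDocsWithField(reader, "numbers");
        if (values == null || docsWithField == null) {
          return; // no segment has this field at all
        }
        for (int doc = 0; doc < reader.maxDoc(); doc++) {
          if (docsWithField.get(doc)) {
            System.out.println(doc + " -> " + values.get(doc));
          } else {
            // values.get(doc) would return 0 here, indistinguishable from a
            // stored 0; that ambiguity is exactly what the Bits resolves
            System.out.println(doc + " -> (missing)");
          }
        }
      }
    }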
* These are mostly like TestSort's tests, except each test * indexes the field up-front as docvalues, and checks no fieldcaches were made */ +@SuppressCodecs({"Lucene40", "Lucene41", "Lucene42"}) // avoid codecs that don't support "missing" public class TestSortDocValues extends LuceneTestCase { @Override @@ -291,6 +293,70 @@ public class TestSortDocValues extends LuceneTestCase { dir.close(); } + /** Tests sorting on type int with a missing value */ + public void testIntMissing() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + writer.addDocument(doc); + doc = new Document(); + doc.add(new NumericDocValuesField("value", -1)); + doc.add(newStringField("value", "-1", Field.Store.YES)); + writer.addDocument(doc); + doc = new Document(); + doc.add(new NumericDocValuesField("value", 4)); + doc.add(newStringField("value", "4", Field.Store.YES)); + writer.addDocument(doc); + IndexReader ir = writer.getReader(); + writer.close(); + + IndexSearcher searcher = newSearcher(ir); + Sort sort = new Sort(new SortField("value", SortField.Type.INT)); + + TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); + assertEquals(3, td.totalHits); + // null is treated as a 0 + assertEquals("-1", searcher.doc(td.scoreDocs[0].doc).get("value")); + assertNull(searcher.doc(td.scoreDocs[1].doc).get("value")); + assertEquals("4", searcher.doc(td.scoreDocs[2].doc).get("value")); + + ir.close(); + dir.close(); + } + + /** Tests sorting on type int, specifying the missing value should be treated as Integer.MAX_VALUE */ + public void testIntMissingLast() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + writer.addDocument(doc); + doc = new Document(); + doc.add(new NumericDocValuesField("value", -1)); + doc.add(newStringField("value", "-1", Field.Store.YES)); + writer.addDocument(doc); + doc = new Document(); + doc.add(new NumericDocValuesField("value", 4)); + doc.add(newStringField("value", "4", Field.Store.YES)); + writer.addDocument(doc); + IndexReader ir = writer.getReader(); + writer.close(); + + IndexSearcher searcher = newSearcher(ir); + SortField sortField = new SortField("value", SortField.Type.INT); + sortField.setMissingValue(Integer.MAX_VALUE); + Sort sort = new Sort(sortField); + + TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); + assertEquals(3, td.totalHits); + // null is treated as a Integer.MAX_VALUE + assertEquals("-1", searcher.doc(td.scoreDocs[0].doc).get("value")); + assertEquals("4", searcher.doc(td.scoreDocs[1].doc).get("value")); + assertNull(searcher.doc(td.scoreDocs[2].doc).get("value")); + + ir.close(); + dir.close(); + } + /** Tests sorting on type long */ public void testLong() throws IOException { Directory dir = newDirectory(); @@ -359,6 +425,70 @@ public class TestSortDocValues extends LuceneTestCase { dir.close(); } + /** Tests sorting on type long with a missing value */ + public void testLongMissing() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + writer.addDocument(doc); + doc = new Document(); + doc.add(new NumericDocValuesField("value", -1)); + doc.add(newStringField("value", "-1", Field.Store.YES)); + writer.addDocument(doc); + doc = new Document(); + doc.add(new NumericDocValuesField("value", 4)); + doc.add(newStringField("value", "4", 
Field.Store.YES)); + writer.addDocument(doc); + IndexReader ir = writer.getReader(); + writer.close(); + + IndexSearcher searcher = newSearcher(ir); + Sort sort = new Sort(new SortField("value", SortField.Type.LONG)); + + TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); + assertEquals(3, td.totalHits); + // null is treated as 0 + assertEquals("-1", searcher.doc(td.scoreDocs[0].doc).get("value")); + assertNull(searcher.doc(td.scoreDocs[1].doc).get("value")); + assertEquals("4", searcher.doc(td.scoreDocs[2].doc).get("value")); + + ir.close(); + dir.close(); + } + + /** Tests sorting on type long, specifying the missing value should be treated as Long.MAX_VALUE */ + public void testLongMissingLast() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + writer.addDocument(doc); + doc = new Document(); + doc.add(new NumericDocValuesField("value", -1)); + doc.add(newStringField("value", "-1", Field.Store.YES)); + writer.addDocument(doc); + doc = new Document(); + doc.add(new NumericDocValuesField("value", 4)); + doc.add(newStringField("value", "4", Field.Store.YES)); + writer.addDocument(doc); + IndexReader ir = writer.getReader(); + writer.close(); + + IndexSearcher searcher = newSearcher(ir); + SortField sortField = new SortField("value", SortField.Type.LONG); + sortField.setMissingValue(Long.MAX_VALUE); + Sort sort = new Sort(sortField); + + TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); + assertEquals(3, td.totalHits); + // null is treated as Long.MAX_VALUE + assertEquals("-1", searcher.doc(td.scoreDocs[0].doc).get("value")); + assertEquals("4", searcher.doc(td.scoreDocs[1].doc).get("value")); + assertNull(searcher.doc(td.scoreDocs[2].doc).get("value")); + + ir.close(); + dir.close(); + } + /** Tests sorting on type float */ public void testFloat() throws IOException { Directory dir = newDirectory(); @@ -427,6 +557,70 @@ public class TestSortDocValues extends LuceneTestCase { dir.close(); } + /** Tests sorting on type float with a missing value */ + public void testFloatMissing() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + writer.addDocument(doc); + doc = new Document(); + doc.add(new FloatDocValuesField("value", -1.3F)); + doc.add(newStringField("value", "-1.3", Field.Store.YES)); + writer.addDocument(doc); + doc = new Document(); + doc.add(new FloatDocValuesField("value", 4.2F)); + doc.add(newStringField("value", "4.2", Field.Store.YES)); + writer.addDocument(doc); + IndexReader ir = writer.getReader(); + writer.close(); + + IndexSearcher searcher = newSearcher(ir); + Sort sort = new Sort(new SortField("value", SortField.Type.FLOAT)); + + TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); + assertEquals(3, td.totalHits); + // null is treated as 0 + assertEquals("-1.3", searcher.doc(td.scoreDocs[0].doc).get("value")); + assertNull(searcher.doc(td.scoreDocs[1].doc).get("value")); + assertEquals("4.2", searcher.doc(td.scoreDocs[2].doc).get("value")); + + ir.close(); + dir.close(); + } + + /** Tests sorting on type float, specifying the missing value should be treated as Float.MAX_VALUE */ + public void testFloatMissingLast() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + writer.addDocument(doc); + doc = new Document(); + 
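The int and long test pairs above (the float and double variants continue below in the same pattern) pin down the sort contract: with no explicit missing value, a document without the field sorts as if its value were 0, and setMissingValue(MAX_VALUE) pushes such documents past every real value in an ascending sort. A compact usage sketch; the field name "price" and the surrounding class are assumptions, not taken from the patch:

    import java.io.IOException;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.MatchAllDocsQuery;
    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;
    import org.apache.lucene.search.TopDocs;

    class MissingValueSortSketch {
      static TopDocs cheapestFirstMissingLast(IndexSearcher searcher) throws IOException {
        SortField byPrice = new SortField("price", SortField.Type.LONG);
        byPrice.setMissingValue(Long.MAX_VALUE); // documents without "price" sort last
        return searcher.search(new MatchAllDocsQuery(), 10, new Sort(byPrice));
      }
    }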
doc.add(new FloatDocValuesField("value", -1.3F)); + doc.add(newStringField("value", "-1.3", Field.Store.YES)); + writer.addDocument(doc); + doc = new Document(); + doc.add(new FloatDocValuesField("value", 4.2F)); + doc.add(newStringField("value", "4.2", Field.Store.YES)); + writer.addDocument(doc); + IndexReader ir = writer.getReader(); + writer.close(); + + IndexSearcher searcher = newSearcher(ir); + SortField sortField = new SortField("value", SortField.Type.FLOAT); + sortField.setMissingValue(Float.MAX_VALUE); + Sort sort = new Sort(sortField); + + TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); + assertEquals(3, td.totalHits); + // null is treated as Float.MAX_VALUE + assertEquals("-1.3", searcher.doc(td.scoreDocs[0].doc).get("value")); + assertEquals("4.2", searcher.doc(td.scoreDocs[1].doc).get("value")); + assertNull(searcher.doc(td.scoreDocs[2].doc).get("value")); + + ir.close(); + dir.close(); + } + /** Tests sorting on type double */ public void testDouble() throws IOException { Directory dir = newDirectory(); @@ -533,4 +727,78 @@ public class TestSortDocValues extends LuceneTestCase { ir.close(); dir.close(); } + + /** Tests sorting on type double with a missing value */ + public void testDoubleMissing() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + writer.addDocument(doc); + doc = new Document(); + doc.add(new DoubleDocValuesField("value", -1.3)); + doc.add(newStringField("value", "-1.3", Field.Store.YES)); + writer.addDocument(doc); + doc = new Document(); + doc.add(new DoubleDocValuesField("value", 4.2333333333333)); + doc.add(newStringField("value", "4.2333333333333", Field.Store.YES)); + writer.addDocument(doc); + doc = new Document(); + doc.add(new DoubleDocValuesField("value", 4.2333333333332)); + doc.add(newStringField("value", "4.2333333333332", Field.Store.YES)); + writer.addDocument(doc); + IndexReader ir = writer.getReader(); + writer.close(); + + IndexSearcher searcher = newSearcher(ir); + Sort sort = new Sort(new SortField("value", SortField.Type.DOUBLE)); + + TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); + assertEquals(4, td.totalHits); + // null treated as a 0 + assertEquals("-1.3", searcher.doc(td.scoreDocs[0].doc).get("value")); + assertNull(searcher.doc(td.scoreDocs[1].doc).get("value")); + assertEquals("4.2333333333332", searcher.doc(td.scoreDocs[2].doc).get("value")); + assertEquals("4.2333333333333", searcher.doc(td.scoreDocs[3].doc).get("value")); + + ir.close(); + dir.close(); + } + + /** Tests sorting on type double, specifying the missing value should be treated as Double.MAX_VALUE */ + public void testDoubleMissingLast() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + writer.addDocument(doc); + doc = new Document(); + doc.add(new DoubleDocValuesField("value", -1.3)); + doc.add(newStringField("value", "-1.3", Field.Store.YES)); + writer.addDocument(doc); + doc = new Document(); + doc.add(new DoubleDocValuesField("value", 4.2333333333333)); + doc.add(newStringField("value", "4.2333333333333", Field.Store.YES)); + writer.addDocument(doc); + doc = new Document(); + doc.add(new DoubleDocValuesField("value", 4.2333333333332)); + doc.add(newStringField("value", "4.2333333333332", Field.Store.YES)); + writer.addDocument(doc); + IndexReader ir = writer.getReader(); + writer.close(); + + IndexSearcher searcher = 
newSearcher(ir); + SortField sortField = new SortField("value", SortField.Type.DOUBLE); + sortField.setMissingValue(Double.MAX_VALUE); + Sort sort = new Sort(sortField); + + TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort); + assertEquals(4, td.totalHits); + // null treated as Double.MAX_VALUE + assertEquals("-1.3", searcher.doc(td.scoreDocs[0].doc).get("value")); + assertEquals("4.2333333333332", searcher.doc(td.scoreDocs[1].doc).get("value")); + assertEquals("4.2333333333333", searcher.doc(td.scoreDocs[2].doc).get("value")); + assertNull(searcher.doc(td.scoreDocs[3].doc).get("value")); + + ir.close(); + dir.close(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestNamedSPILoader.java b/lucene/core/src/test/org/apache/lucene/util/TestNamedSPILoader.java index 4f2c51edf00..1724cc26ec0 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestNamedSPILoader.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestNamedSPILoader.java @@ -25,8 +25,8 @@ import org.apache.lucene.codecs.Codec; // enough to test the basics via Codec public class TestNamedSPILoader extends LuceneTestCase { public void testLookup() { - Codec codec = Codec.forName("Lucene42"); - assertEquals("Lucene42", codec.getName()); + Codec codec = Codec.forName("Lucene45"); + assertEquals("Lucene45", codec.getName()); } // we want an exception if its not found. @@ -39,6 +39,6 @@ public class TestNamedSPILoader extends LuceneTestCase { public void testAvailableServices() { Set codecs = Codec.availableCodecs(); - assertTrue(codecs.contains("Lucene42")); + assertTrue(codecs.contains("Lucene45")); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java index e08899c5d01..79a78e0acfd 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java @@ -68,7 +68,9 @@ public class Facet42DocValuesConsumer extends DocValuesConsumer { long totBytes = 0; for (BytesRef v : values) { - totBytes += v.length; + if (v != null) { + totBytes += v.length; + } } if (totBytes > Integer.MAX_VALUE) { @@ -78,7 +80,9 @@ public class Facet42DocValuesConsumer extends DocValuesConsumer { out.writeVInt((int) totBytes); for (BytesRef v : values) { - out.writeBytes(v.bytes, v.offset, v.length); + if (v != null) { + out.writeBytes(v.bytes, v.offset, v.length); + } } PackedInts.Writer w = PackedInts.getWriter(out, maxDoc+1, PackedInts.bitsRequired(totBytes+1), acceptableOverheadRatio); @@ -86,7 +90,9 @@ public class Facet42DocValuesConsumer extends DocValuesConsumer { int address = 0; for(BytesRef v : values) { w.add(address); - address += v.length; + if (v != null) { + address += v.length; + } } w.add(address); w.finish(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java index ad7cb27caca..80daa1cc588 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java @@ -31,15 +31,18 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import 
org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; class Facet42DocValuesProducer extends DocValuesProducer { private final Map fields = new HashMap(); + private final int maxDoc; Facet42DocValuesProducer(SegmentReadState state) throws IOException { String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Facet42DocValuesFormat.EXTENSION); IndexInput in = state.directory.openInput(fileName, state.context); + this.maxDoc = state.segmentInfo.getDocCount(); boolean success = false; try { CodecUtil.checkHeader(in, Facet42DocValuesFormat.CODEC, @@ -80,6 +83,11 @@ class Facet42DocValuesProducer extends DocValuesProducer { throw new UnsupportedOperationException("FacetsDocValues only implements binary"); } + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + return new Bits.MatchAllBits(maxDoc); // TODO: have codec impl this? + } + @Override public void close() throws IOException { } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42Codec.java b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/Facet45Codec.java similarity index 85% rename from lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42Codec.java rename to lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/Facet45Codec.java index 4987062afc0..9ff508429dc 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42Codec.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/Facet45Codec.java @@ -1,4 +1,4 @@ -package org.apache.lucene.facet.codecs.facet42; +package org.apache.lucene.facet.codecs.facet45; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,12 +21,13 @@ import java.util.HashSet; import java.util.Set; import org.apache.lucene.codecs.DocValuesFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; +import org.apache.lucene.facet.codecs.facet42.Facet42DocValuesFormat; import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.params.FacetIndexingParams; /** - * Same as {@link Lucene42Codec} except it uses {@link Facet42DocValuesFormat} + * Same as {@link Lucene45Codec} except it uses {@link Facet42DocValuesFormat} * for facet fields (faster-but-more-RAM-consuming doc values). * *
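The Facet45Codec hunks below keep the RAM-resident Facet42 DocValues format for facet fields, but replace the hardcoded "Lucene42" default with a call to super.getDocValuesFormatForField, so non-facet fields automatically follow whatever the parent Lucene45Codec chooses. Opting in at indexing time looks roughly like this sketch (the Directory and Analyzer are assumed):

    import java.io.IOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.facet.codecs.facet45.Facet45Codec;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.Version;

    class FacetCodecSketch {
      static IndexWriter openFacetWriter(Directory dir, Analyzer analyzer) throws IOException {
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_45, analyzer);
        iwc.setCodec(new Facet45Codec()); // Lucene45 defaults + Facet42 format for facet fields
        return new IndexWriter(dir, iwc);
      }
    }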

@@ -42,15 +43,14 @@ import org.apache.lucene.facet.params.FacetIndexingParams; * * @lucene.experimental */ -public class Facet42Codec extends Lucene42Codec { +public class Facet45Codec extends Lucene45Codec { private final Set facetFields; private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName("Facet42"); - private final DocValuesFormat lucene42DVFormat = DocValuesFormat.forName("Lucene42"); // must have that for SPI purposes /** Default constructor, uses {@link FacetIndexingParams#DEFAULT}. */ - public Facet42Codec() { + public Facet45Codec() { this(FacetIndexingParams.DEFAULT); } @@ -59,7 +59,7 @@ public class Facet42Codec extends Lucene42Codec { * {@link DocValuesFormat} for the fields that are returned by * {@link FacetIndexingParams#getAllCategoryListParams()}. */ - public Facet42Codec(FacetIndexingParams fip) { + public Facet45Codec(FacetIndexingParams fip) { if (fip.getPartitionSize() != Integer.MAX_VALUE) { throw new IllegalArgumentException("this Codec does not support partitions"); } @@ -74,8 +74,7 @@ public class Facet42Codec extends Lucene42Codec { if (facetFields.contains(field)) { return facetsDVFormat; } else { - return lucene42DVFormat; + return super.getDocValuesFormatForField(field); } } - } diff --git a/solr/core/src/test-files/solr/collection1/conf/bad-schema-docValues-not-required-no-default.xml b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/package.html similarity index 65% rename from solr/core/src/test-files/solr/collection1/conf/bad-schema-docValues-not-required-no-default.xml rename to lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/package.html index deadd9ac68b..c752b963484 100644 --- a/solr/core/src/test-files/solr/collection1/conf/bad-schema-docValues-not-required-no-default.xml +++ b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/package.html @@ -1,4 +1,4 @@ [the XML/HTML markup of this hunk was stripped during extraction; the deleted side is the old Solr test schema, and the surviving added side is the new package description:] +<html> +<body> +Codec + DocValuesFormat that are optimized for facets. +</body> +</html> diff --git a/lucene/facet/src/java/org/apache/lucene/facet/util/FacetsPayloadMigrationReader.java b/lucene/facet/src/java/org/apache/lucene/facet/util/FacetsPayloadMigrationReader.java index ec4d2c9d638..e3d998d5159 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/util/FacetsPayloadMigrationReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/util/FacetsPayloadMigrationReader.java @@ -40,6 +40,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; /** @@ -222,6 +223,18 @@ public class FacetsPayloadMigrationReader extends FilterAtomicReader { } } + @Override + public Bits getDocsWithField(String field) throws IOException { + Term term = fieldTerms.get(field); + if (term == null) { + return super.getDocsWithField(field); + } else { + // we shouldn't return null, even if the term does not exist or has no + // payloads, since we already marked the field as having DocValues.
+ return new Bits.MatchAllBits(maxDoc()); + } + } + @Override public FieldInfos getFieldInfos() { FieldInfos innerInfos = super.getFieldInfos(); diff --git a/lucene/facet/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/facet/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 343b4cdb07e..d6e8c740686 100644 --- a/lucene/facet/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/facet/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -org.apache.lucene.facet.codecs.facet42.Facet42Codec +org.apache.lucene.facet.codecs.facet45.Facet45Codec diff --git a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java index 31e79edbf14..6bf9fe6e009 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java @@ -3,7 +3,7 @@ package org.apache.lucene.facet; import java.util.Random; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.facet.codecs.facet42.Facet42Codec; +import org.apache.lucene.facet.codecs.facet45.Facet45Codec; import org.apache.lucene.facet.encoding.DGapIntEncoder; import org.apache.lucene.facet.encoding.DGapVInt8IntEncoder; import org.apache.lucene.facet.encoding.EightFlagsIntEncoder; @@ -53,7 +53,7 @@ public abstract class FacetTestCase extends LuceneTestCase { public static void beforeClassFacetTestCase() throws Exception { if (random().nextDouble() < 0.3) { savedDefault = Codec.getDefault(); // save to restore later - Codec.setDefault(new Facet42Codec()); + Codec.setDefault(new Facet45Codec()); } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java index 0cf73fcdb23..d0e65772160 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java @@ -31,7 +31,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.FacetTestUtils; -import org.apache.lucene.facet.codecs.facet42.Facet42Codec; +import org.apache.lucene.facet.codecs.facet45.Facet45Codec; import org.apache.lucene.facet.index.FacetFields; import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.params.FacetIndexingParams; @@ -260,7 +260,7 @@ public class TestDemoFacets extends FacetTestCase { Directory dir = newDirectory(); Directory taxoDir = newDirectory(); IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); - iwc.setCodec(new Facet42Codec()); + iwc.setCodec(new Facet45Codec()); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java index a207b970de3..a2b1bd45b2d 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java +++ 
b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java @@ -422,6 +422,11 @@ public class WeightedSpanTermExtractor { public NumericDocValues getNormValues(String field) throws IOException { return super.getNormValues(FIELD_NAME); } + + @Override + public Bits getDocsWithField(String field) throws IOException { + return super.getDocsWithField(FIELD_NAME); + } } /** diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 8107aab3b68..3a0c6bbe62f 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -756,6 +756,11 @@ public class MemoryIndex { return null; } + @Override + public Bits getDocsWithField(String field) throws IOException { + return null; + } + private class MemoryFields extends Fields { @Override public Iterator iterator() { diff --git a/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java b/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java index f7cce128692..469357dab63 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java +++ b/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java @@ -221,6 +221,27 @@ public class SortingAtomicReader extends FilterAtomicReader { } } + private static class SortingBits implements Bits { + + private final Bits in; + private final Sorter.DocMap docMap; + + public SortingBits(final Bits in, Sorter.DocMap docMap) { + this.in = in; + this.docMap = docMap; + } + + @Override + public boolean get(int index) { + return in.get(docMap.newToOld(index)); + } + + @Override + public int length() { + return in.length(); + } + } + private static class SortingSortedDocValues extends SortedDocValues { private final SortedDocValues in; @@ -743,20 +764,9 @@ public class SortingAtomicReader extends FilterAtomicReader { final Bits inLiveDocs = in.getLiveDocs(); if (inLiveDocs == null) { return null; + } else { + return new SortingBits(inLiveDocs, docMap); } - return new Bits() { - - @Override - public boolean get(int index) { - return inLiveDocs.get(docMap.newToOld(index)); - } - - @Override - public int length() { - return inLiveDocs.length(); - } - - }; } @Override @@ -796,6 +806,16 @@ public class SortingAtomicReader extends FilterAtomicReader { } } + @Override + public Bits getDocsWithField(String field) throws IOException { + Bits bits = in.getDocsWithField(field); + if (bits == null || bits instanceof Bits.MatchAllBits || bits instanceof Bits.MatchNoBits) { + return bits; + } else { + return new SortingBits(bits, docMap); + } + } + @Override public Fields getTermVectors(final int docID) throws IOException { return in.getTermVectors(docMap.newToOld(docID)); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/BytesRefFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/BytesRefFieldSource.java index edbc37c8c76..871c94cc50e 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/BytesRefFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/BytesRefFieldSource.java @@ -45,12 +45,13 @@ public class BytesRefFieldSource extends FieldCacheSource { // To be sorted or not to be sorted, that is the question // TODO: do it cleaner? 
if (fieldInfo != null && fieldInfo.getDocValuesType() == DocValuesType.BINARY) { + final Bits docsWithField = FieldCache.DEFAULT.getDocsWithField(readerContext.reader(), field); final BinaryDocValues binaryValues = FieldCache.DEFAULT.getTerms(readerContext.reader(), field); return new FunctionValues() { @Override public boolean exists(int doc) { - return true; // doc values are dense + return docsWithField.get(doc); } @Override diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleFieldSource.java index 8da0ad64224..73c5d497548 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleFieldSource.java @@ -68,7 +68,7 @@ public class DoubleFieldSource extends FieldCacheSource { @Override public boolean exists(int doc) { - return valid.get(doc); + return arr.get(doc) != 0 || valid.get(doc); } @Override @@ -142,7 +142,7 @@ public class DoubleFieldSource extends FieldCacheSource { @Override public void fillValue(int doc) { mval.value = arr.get(doc); - mval.exists = valid.get(doc); + mval.exists = mval.value != 0 || valid.get(doc); } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatFieldSource.java index da36cfc1360..c81526365c4 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatFieldSource.java @@ -72,7 +72,7 @@ public class FloatFieldSource extends FieldCacheSource { @Override public boolean exists(int doc) { - return valid.get(doc); + return arr.get(doc) != 0 || valid.get(doc); } @Override @@ -88,7 +88,7 @@ public class FloatFieldSource extends FieldCacheSource { @Override public void fillValue(int doc) { mval.value = arr.get(doc); - mval.exists = valid.get(doc); + mval.exists = mval.value != 0 || valid.get(doc); } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IntFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IntFieldSource.java index 296432d6f9a..768d9d77e5a 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IntFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IntFieldSource.java @@ -95,7 +95,7 @@ public class IntFieldSource extends FieldCacheSource { @Override public boolean exists(int doc) { - return valid.get(doc); + return arr.get(doc) != 0 || valid.get(doc); } @Override @@ -150,7 +150,7 @@ public class IntFieldSource extends FieldCacheSource { @Override public void fillValue(int doc) { mval.value = arr.get(doc); - mval.exists = valid.get(doc); + mval.exists = mval.value != 0 || valid.get(doc); } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/LongFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/LongFieldSource.java index 597efe89e97..d1718313d8b 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/LongFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/LongFieldSource.java @@ -81,7 +81,7 @@ public class LongFieldSource extends FieldCacheSource 
{ @Override public boolean exists(int doc) { - return valid.get(doc); + return arr.get(doc) != 0 || valid.get(doc); } @Override @@ -141,7 +141,7 @@ public class LongFieldSource extends FieldCacheSource { @Override public void fillValue(int doc) { mval.value = arr.get(doc); - mval.exists = valid.get(doc); + mval.exists = mval.value != 0 || valid.get(doc); } }; } diff --git a/lucene/site/xsl/index.xsl b/lucene/site/xsl/index.xsl index 9a7235b395b..842f7be3d9d 100644 --- a/lucene/site/xsl/index.xsl +++ b/lucene/site/xsl/index.xsl @@ -75,7 +75,7 @@

 System Requirements: Minimum and supported Java versions.
 Migration Guide: What changed in Lucene 4; how to migrate code from Lucene 3.x.
 JRE Version Migration: Information about upgrading between major JRE versions.
-<li><a href="core/org/apache/lucene/codecs/lucene42/package-summary.html#package_description">File Formats</a>: Guide to the supported index format used by Lucene. This can be customized by using an alternate codec.</li>
+<li><a href="core/org/apache/lucene/codecs/lucene45/package-summary.html#package_description">File Formats</a>: Guide to the supported index format used by Lucene. This can be customized by using an alternate codec.</li>
 Search and Scoring in Lucene: Introduction to how Lucene scores documents.
 Classic Scoring Formula: Formula of Lucene's classic Vector Space implementation. (look here for other models)
 Classic QueryParser Syntax: Overview of the Classic QueryParser's syntax and features.
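The FunctionValues.exists rewrites above and the BBoxSimilarityValueSource change just below all lean on the same FieldCache convention: an array slot for a document with no value reads back as 0, so a non-zero value by itself already proves the document has one, and the docs-with-field Bits only needs to be consulted to tell a stored 0 apart from a missing value. In sketch form, with arr and valid named as in the patch:

    // arr: the cached per-document values; valid: the docs-with-field Bits
    long value = arr.get(doc);   // reads 0 when the document has no value
    boolean exists = value != 0  // any non-zero value must have been indexed
        || valid.get(doc);       // only a 0 needs the Bits lookup

This keeps exists() cheap for the common non-zero case while staying correct for documents that really do store 0.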
diff --git a/lucene/spatial/src/java/org/apache/lucene/spatial/bbox/BBoxSimilarityValueSource.java b/lucene/spatial/src/java/org/apache/lucene/spatial/bbox/BBoxSimilarityValueSource.java index f501f150ddb..2a3b2a7c539 100644 --- a/lucene/spatial/src/java/org/apache/lucene/spatial/bbox/BBoxSimilarityValueSource.java +++ b/lucene/spatial/src/java/org/apache/lucene/spatial/bbox/BBoxSimilarityValueSource.java @@ -78,10 +78,12 @@ public class BBoxSimilarityValueSource extends ValueSource { @Override public float floatVal(int doc) { + double minXVal = minX.get(doc); + double maxXVal = maxX.get(doc); // make sure it has minX and area - if (validMinX.get(doc) && validMaxX.get(doc)) { + if ((minXVal != 0 || validMinX.get(doc)) && (maxXVal != 0 || validMaxX.get(doc))) { rect.reset( - minX.get(doc), maxX.get(doc), + minXVal, maxXVal, minY.get(doc), maxY.get(doc)); return (float) similarity.score(rect, null); } else { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java index 8c8ad170c2c..60f68ead5ec 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java @@ -34,7 +34,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -161,7 +161,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { * codec to use. */ protected IndexWriterConfig getIndexWriterConfig(Version matchVersion, Analyzer indexAnalyzer) { IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer); - iwc.setCodec(new Lucene42Codec()); + iwc.setCodec(new Lucene45Codec()); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); return iwc; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/MissingOrdRemapper.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/MissingOrdRemapper.java new file mode 100644 index 00000000000..770cfbda68c --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/MissingOrdRemapper.java @@ -0,0 +1,122 @@ +package org.apache.lucene.codecs; + +import java.util.Iterator; + +import org.apache.lucene.util.BytesRef; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * a utility class to write missing values for SORTED as if they were the empty string + * (to simulate pre-Lucene4.5 dv behavior for testing old codecs) + */ +public class MissingOrdRemapper { + + /** insert an empty byte[] to the front of this iterable */ + public static Iterable insertEmptyValue(final Iterable iterable) { + return new Iterable() { + @Override + public Iterator iterator() { + return new Iterator() { + boolean seenEmpty = false; + Iterator in = iterable.iterator(); + + @Override + public boolean hasNext() { + return !seenEmpty || in.hasNext(); + } + + @Override + public BytesRef next() { + if (!seenEmpty) { + seenEmpty = true; + return new BytesRef(); + } else { + return in.next(); + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + }; + } + + /** remaps ord -1 to ord 0 on this iterable. */ + public static Iterable mapMissingToOrd0(final Iterable iterable) { + return new Iterable() { + @Override + public Iterator iterator() { + return new Iterator() { + Iterator in = iterable.iterator(); + + @Override + public boolean hasNext() { + return in.hasNext(); + } + + @Override + public Number next() { + Number n = in.next(); + if (n.longValue() == -1) { + return 0; + } else { + return n; + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + }; + } + + /** remaps every ord+1 on this iterable */ + public static Iterable mapAllOrds(final Iterable iterable) { + return new Iterable() { + @Override + public Iterator iterator() { + return new Iterator() { + Iterator in = iterable.iterator(); + + @Override + public boolean hasNext() { + return in.hasNext(); + } + + @Override + public Number next() { + Number n = in.next(); + return n.longValue()+1; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + }; + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java index 19cad74151f..9591997229d 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java @@ -23,10 +23,10 @@ import org.apache.lucene.codecs.NormsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; /** - * Acts like {@link Lucene42Codec} but with additional asserts. + * Acts like {@link Lucene45Codec} but with additional asserts. 
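MissingOrdRemapper above is the bridge for the changed consumer contract: numeric and binary iterables may now hand null to the codec for a missing document, and sorted fields hand ord -1, which the pre-4.5 formats cannot represent. A test codec that must keep writing an old format can splice in an empty value instead; a sketch of an addSortedField override using the helpers above (the delegate consumer is an assumption for illustration):

    // prepend "" as the new ord 0 and shift all real ords up by one;
    // mapAllOrds turns every ord n into n+1, so the missing ord -1 becomes 0
    // and now points at the empty value the old format can actually store
    Iterable<BytesRef> valuesWithEmpty = MissingOrdRemapper.insertEmptyValue(values);
    Iterable<Number> shiftedOrds = MissingOrdRemapper.mapAllOrds(docToOrd);
    delegate.addSortedField(field, valuesWithEmpty, shiftedOrds);

When the field already contains the empty string as its smallest value, mapMissingToOrd0 alone is enough, since ord 0 already points at an empty value.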
*/ public final class AssertingCodec extends FilterCodec { @@ -37,7 +37,7 @@ public final class AssertingCodec extends FilterCodec { private final NormsFormat norms = new AssertingNormsFormat(); public AssertingCodec() { - super("Asserting", new Lucene42Codec()); + super("Asserting", new Lucene45Codec()); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java index 998bea7835f..f3525a43189 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java @@ -24,7 +24,7 @@ import java.util.NoSuchElementException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat; import org.apache.lucene.index.AssertingAtomicReader; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.FieldInfo; @@ -33,15 +33,16 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.OpenBitSet; /** - * Just like {@link Lucene42DocValuesFormat} but with additional asserts. + * Just like {@link Lucene45DocValuesFormat} but with additional asserts. 
*/ public class AssertingDocValuesFormat extends DocValuesFormat { - private final DocValuesFormat in = new Lucene42DocValuesFormat(); + private final DocValuesFormat in = new Lucene45DocValuesFormat(); public AssertingDocValuesFormat() { super("Asserting"); @@ -75,11 +76,10 @@ public class AssertingDocValuesFormat extends DocValuesFormat { public void addNumericField(FieldInfo field, Iterable values) throws IOException { int count = 0; for (Number v : values) { - assert v != null; count++; } assert count == maxDoc; - checkIterator(values.iterator(), maxDoc); + checkIterator(values.iterator(), maxDoc, true); in.addNumericField(field, values); } @@ -87,12 +87,11 @@ public class AssertingDocValuesFormat extends DocValuesFormat { public void addBinaryField(FieldInfo field, Iterable values) throws IOException { int count = 0; for (BytesRef b : values) { - assert b != null; - assert b.isValid(); + assert b == null || b.isValid(); count++; } assert count == maxDoc; - checkIterator(values.iterator(), maxDoc); + checkIterator(values.iterator(), maxDoc, true); in.addBinaryField(field, values); } @@ -117,15 +116,17 @@ public class AssertingDocValuesFormat extends DocValuesFormat { for (Number v : docToOrd) { assert v != null; int ord = v.intValue(); - assert ord >= 0 && ord < valueCount; - seenOrds.set(ord); + assert ord >= -1 && ord < valueCount; + if (ord >= 0) { + seenOrds.set(ord); + } count++; } assert count == maxDoc; assert seenOrds.cardinality() == valueCount; - checkIterator(values.iterator(), valueCount); - checkIterator(docToOrd.iterator(), maxDoc); + checkIterator(values.iterator(), valueCount, false); + checkIterator(docToOrd.iterator(), maxDoc, false); in.addSortedField(field, values, docToOrd); } @@ -169,33 +170,11 @@ public class AssertingDocValuesFormat extends DocValuesFormat { assert docCount == maxDoc; assert seenOrds.cardinality() == valueCount; - checkIterator(values.iterator(), valueCount); - checkIterator(docToOrdCount.iterator(), maxDoc); - checkIterator(ords.iterator(), ordCount); + checkIterator(values.iterator(), valueCount, false); + checkIterator(docToOrdCount.iterator(), maxDoc, false); + checkIterator(ords.iterator(), ordCount, false); in.addSortedSetField(field, values, docToOrdCount, ords); } - - private void checkIterator(Iterator iterator, long expectedSize) { - for (long i = 0; i < expectedSize; i++) { - boolean hasNext = iterator.hasNext(); - assert hasNext; - T v = iterator.next(); - assert v != null; - try { - iterator.remove(); - throw new AssertionError("broken iterator (supports remove): " + iterator); - } catch (UnsupportedOperationException expected) { - // ok - } - } - assert !iterator.hasNext(); - try { - iterator.next(); - throw new AssertionError("broken iterator (allows next() when hasNext==false) " + iterator); - } catch (NoSuchElementException expected) { - // ok - } - } @Override public void close() throws IOException { @@ -203,6 +182,70 @@ public class AssertingDocValuesFormat extends DocValuesFormat { } } + static class AssertingNormsConsumer extends DocValuesConsumer { + private final DocValuesConsumer in; + private final int maxDoc; + + AssertingNormsConsumer(DocValuesConsumer in, int maxDoc) { + this.in = in; + this.maxDoc = maxDoc; + } + + @Override + public void addNumericField(FieldInfo field, Iterable values) throws IOException { + int count = 0; + for (Number v : values) { + assert v != null; + count++; + } + assert count == maxDoc; + checkIterator(values.iterator(), maxDoc, false); + in.addNumericField(field, values); + } + + @Override 
+ public void close() throws IOException { + in.close(); + } + + @Override + public void addBinaryField(FieldInfo field, Iterable values) throws IOException { + throw new IllegalStateException(); + } + + @Override + public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + throw new IllegalStateException(); + } + + @Override + public void addSortedSetField(FieldInfo field, Iterable values, Iterable docToOrdCount, Iterable ords) throws IOException { + throw new IllegalStateException(); + } + } + + private static void checkIterator(Iterator iterator, long expectedSize, boolean allowNull) { + for (long i = 0; i < expectedSize; i++) { + boolean hasNext = iterator.hasNext(); + assert hasNext; + T v = iterator.next(); + assert allowNull || v != null; + try { + iterator.remove(); + throw new AssertionError("broken iterator (supports remove): " + iterator); + } catch (UnsupportedOperationException expected) { + // ok + } + } + assert !iterator.hasNext(); + try { + iterator.next(); + throw new AssertionError("broken iterator (allows next() when hasNext==false) " + iterator); + } catch (NoSuchElementException expected) { + // ok + } + } + static class AssertingDocValuesProducer extends DocValuesProducer { private final DocValuesProducer in; private final int maxDoc; @@ -244,6 +287,15 @@ public class AssertingDocValuesFormat extends DocValuesFormat { assert values != null; return new AssertingAtomicReader.AssertingSortedSetDocValues(values, maxDoc); } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + assert field.getDocValuesType() != null; + Bits bits = in.getDocsWithField(field); + assert bits != null; + assert bits.length() == maxDoc; + return new AssertingAtomicReader.AssertingBits(bits); + } @Override public void close() throws IOException { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java index 8b64401b452..b7662674133 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java @@ -22,7 +22,7 @@ import java.io.IOException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.NormsFormat; -import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat.AssertingDocValuesConsumer; +import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat.AssertingNormsConsumer; import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat.AssertingDocValuesProducer; import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; import org.apache.lucene.index.SegmentReadState; @@ -38,7 +38,7 @@ public class AssertingNormsFormat extends NormsFormat { public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException { DocValuesConsumer consumer = in.normsConsumer(state); assert consumer != null; - return new AssertingDocValuesConsumer(consumer, state.segmentInfo.getDocCount()); + return new AssertingNormsConsumer(consumer, state.segmentInfo.getDocCount()); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardCodec.java index 644f04becea..15310e09bc4 100644 --- 
a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardCodec.java @@ -23,10 +23,12 @@ import org.apache.lucene.codecs.NormsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat; +import org.apache.lucene.codecs.diskdv.DiskNormsFormat; import org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsFormat; import org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat; import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; /** Codec that tries to use as little ram as possible because he spent all his money on beer */ // TODO: better name :) @@ -39,11 +41,11 @@ public class CheapBastardCodec extends FilterCodec { private final StoredFieldsFormat storedFields = new Lucene40StoredFieldsFormat(); private final TermVectorsFormat termVectors = new Lucene40TermVectorsFormat(); // these go to disk for all docvalues/norms datastructures - private final DocValuesFormat docValues = new CheapBastardDocValuesFormat(); - private final NormsFormat norms = new CheapBastardNormsFormat(); + private final DocValuesFormat docValues = new DiskDocValuesFormat(); + private final NormsFormat norms = new DiskNormsFormat(); public CheapBastardCodec() { - super("CheapBastard", new Lucene42Codec()); + super("CheapBastard", new Lucene45Codec()); } public PostingsFormat postingsFormat() { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java deleted file mode 100644 index 07f152cd6ee..00000000000 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java +++ /dev/null @@ -1,74 +0,0 @@ -package org.apache.lucene.codecs.cheapbastard; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.codecs.DocValuesConsumer; -import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.DocValuesFormat; -import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer; -import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.util.BytesRef; - -/** - * DocValues format that keeps everything on disk. - *

<p> - * Internally there are only 2 field types: - * <ul> - *    <li>BINARY: a big byte[]. - *    <li>NUMERIC: packed ints - * </ul> - * SORTED is encoded as BINARY + NUMERIC - * <p> - * NOTE: Don't use this format in production (its not very efficient). - * Most likely you would want some parts in RAM, other parts on disk. - * <p>
    - * @lucene.experimental - */ -public final class CheapBastardDocValuesFormat extends DocValuesFormat { - - public CheapBastardDocValuesFormat() { - super("CheapBastard"); - } - - @Override - public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - return new DiskDocValuesConsumer(state, DiskDocValuesFormat.DATA_CODEC, - DiskDocValuesFormat.DATA_EXTENSION, - DiskDocValuesFormat.META_CODEC, - DiskDocValuesFormat.META_EXTENSION) { - // don't ever write an index, we dont want to use RAM :) - @Override - protected void addTermsDict(FieldInfo field, Iterable values) throws IOException { - addBinaryField(field, values); - } - }; - } - - @Override - public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { - return new CheapBastardDocValuesProducer(state, DiskDocValuesFormat.DATA_CODEC, - DiskDocValuesFormat.DATA_EXTENSION, - DiskDocValuesFormat.META_CODEC, - DiskDocValuesFormat.META_EXTENSION); - } -} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java deleted file mode 100644 index 52f36d2ae40..00000000000 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java +++ /dev/null @@ -1,431 +0,0 @@ -package org.apache.lucene.codecs.cheapbastard; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.DELTA_COMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.GCD_COMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.TABLE_COMPRESSED; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer; -import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat; -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SortedDocValues; -import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.packed.BlockPackedReader; -import org.apache.lucene.util.packed.MonotonicBlockPackedReader; -import org.apache.lucene.util.packed.PackedInts; - -class CheapBastardDocValuesProducer extends DocValuesProducer { - private final Map numerics; - private final Map ords; - private final Map ordIndexes; - private final Map binaries; - private final IndexInput data; - - CheapBastardDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { - String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); - // read in the entries from the metadata file. 
- IndexInput in = state.directory.openInput(metaName, state.context); - boolean success = false; - final int version; - try { - version = CodecUtil.checkHeader(in, metaCodec, - DiskDocValuesFormat.VERSION_CURRENT, - DiskDocValuesFormat.VERSION_CURRENT); - numerics = new HashMap(); - ords = new HashMap(); - ordIndexes = new HashMap(); - binaries = new HashMap(); - readFields(in); - - success = true; - } finally { - if (success) { - IOUtils.close(in); - } else { - IOUtils.closeWhileHandlingException(in); - } - } - - success = false; - try { - String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); - data = state.directory.openInput(dataName, state.context); - final int version2 = CodecUtil.checkHeader(data, dataCodec, - DiskDocValuesFormat.VERSION_CURRENT, - DiskDocValuesFormat.VERSION_CURRENT); - if (version != version2) { - throw new CorruptIndexException("Versions mismatch"); - } - - success = true; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(this.data); - } - } - - } - - private void readFields(IndexInput meta) throws IOException { - int fieldNumber = meta.readVInt(); - while (fieldNumber != -1) { - byte type = meta.readByte(); - if (type == DiskDocValuesFormat.NUMERIC) { - numerics.put(fieldNumber, readNumericEntry(meta)); - } else if (type == DiskDocValuesFormat.BINARY) { - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - } else if (type == DiskDocValuesFormat.SORTED) { - // sorted = binary + numeric - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != DiskDocValuesFormat.BINARY) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - NumericEntry n = readNumericEntry(meta); - ords.put(fieldNumber, n); - } else if (type == DiskDocValuesFormat.SORTED_SET) { - // sortedset = binary + numeric + ordIndex - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != DiskDocValuesFormat.BINARY) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - NumericEntry n1 = readNumericEntry(meta); - ords.put(fieldNumber, n1); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - NumericEntry n2 = readNumericEntry(meta); - ordIndexes.put(fieldNumber, n2); - } else { - throw new CorruptIndexException("invalid type: " + type + ", 
resource=" + meta); - } - fieldNumber = meta.readVInt(); - } - } - - static NumericEntry readNumericEntry(IndexInput meta) throws IOException { - NumericEntry entry = new NumericEntry(); - entry.format = meta.readVInt(); - entry.packedIntsVersion = meta.readVInt(); - entry.offset = meta.readLong(); - entry.count = meta.readVLong(); - entry.blockSize = meta.readVInt(); - switch(entry.format) { - case GCD_COMPRESSED: - entry.minValue = meta.readLong(); - entry.gcd = meta.readLong(); - break; - case TABLE_COMPRESSED: - if (entry.count > Integer.MAX_VALUE) { - throw new CorruptIndexException("Cannot use TABLE_COMPRESSED with more than MAX_VALUE values, input=" + meta); - } - final int uniqueValues = meta.readVInt(); - if (uniqueValues > 256) { - throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + meta); - } - entry.table = new long[uniqueValues]; - for (int i = 0; i < uniqueValues; ++i) { - entry.table[i] = meta.readLong(); - } - break; - case DELTA_COMPRESSED: - break; - default: - throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); - } - return entry; - } - - static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException { - BinaryEntry entry = new BinaryEntry(); - int format = meta.readVInt(); - if (format != DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED && format != DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED) { - throw new CorruptIndexException("Unexpected format for binary entry: " + format + ", input=" + meta); - } - entry.minLength = meta.readVInt(); - entry.maxLength = meta.readVInt(); - entry.count = meta.readVLong(); - entry.offset = meta.readLong(); - if (entry.minLength != entry.maxLength) { - entry.addressesOffset = meta.readLong(); - entry.packedIntsVersion = meta.readVInt(); - entry.blockSize = meta.readVInt(); - } - return entry; - } - - @Override - public NumericDocValues getNumeric(FieldInfo field) throws IOException { - NumericEntry entry = numerics.get(field.number); - return getNumeric(field, entry); - } - - private LongNumericDocValues getNumeric(FieldInfo field, final NumericEntry entry) throws IOException { - final IndexInput data = this.data.clone(); - data.seek(entry.offset); - - switch (entry.format) { - case DELTA_COMPRESSED: - final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - return new LongNumericDocValues() { - @Override - public long get(long id) { - return reader.get(id); - } - }; - case GCD_COMPRESSED: - final long min = entry.minValue; - final long mult = entry.gcd; - final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - return new LongNumericDocValues() { - @Override - public long get(long id) { - return min + mult * quotientReader.get(id); - } - }; - case TABLE_COMPRESSED: - final long[] table = entry.table; - final int bitsRequired = PackedInts.bitsRequired(table.length - 1); - final PackedInts.Reader ords = PackedInts.getDirectReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, (int) entry.count, bitsRequired); - return new LongNumericDocValues() { - @Override - long get(long id) { - return table[(int) ords.get((int) id)]; - } - }; - default: - throw new AssertionError(); - } - } - - @Override - public BinaryDocValues getBinary(FieldInfo field) throws IOException { - BinaryEntry bytes = binaries.get(field.number); - if (bytes.minLength == bytes.maxLength) { - return 
getFixedBinary(field, bytes); - } else { - return getVariableBinary(field, bytes); - } - } - - private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) { - final IndexInput data = this.data.clone(); - - return new LongBinaryDocValues() { - @Override - public void get(long id, BytesRef result) { - long address = bytes.offset + id * bytes.maxLength; - try { - data.seek(address); - // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) - // assume "they" own the bytes after calling this! - final byte[] buffer = new byte[bytes.maxLength]; - data.readBytes(buffer, 0, buffer.length); - result.bytes = buffer; - result.offset = 0; - result.length = buffer.length; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; - } - - private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final IndexInput data = this.data.clone(); - data.seek(bytes.addressesOffset); - - final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, true); - return new LongBinaryDocValues() { - @Override - public void get(long id, BytesRef result) { - long startAddress = bytes.offset + (id == 0 ? 0 : + addresses.get(id-1)); - long endAddress = bytes.offset + addresses.get(id); - int length = (int) (endAddress - startAddress); - try { - data.seek(startAddress); - // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) - // assume "they" own the bytes after calling this! - final byte[] buffer = new byte[length]; - data.readBytes(buffer, 0, buffer.length); - result.bytes = buffer; - result.offset = 0; - result.length = length; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; - } - - @Override - public SortedDocValues getSorted(FieldInfo field) throws IOException { - final int valueCount = (int) binaries.get(field.number).count; - final BinaryDocValues binary = getBinary(field); - final NumericDocValues ordinals = getNumeric(field, ords.get(field.number)); - return new SortedDocValues() { - - @Override - public int getOrd(int docID) { - return (int) ordinals.get(docID); - } - - @Override - public void lookupOrd(int ord, BytesRef result) { - binary.get(ord, result); - } - - @Override - public int getValueCount() { - return valueCount; - } - }; - } - - @Override - public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - final long valueCount = binaries.get(field.number).count; - final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); - final LongNumericDocValues ordinals = getNumeric(field, ords.get(field.number)); - NumericEntry entry = ordIndexes.get(field.number); - IndexInput data = this.data.clone(); - data.seek(entry.offset); - final MonotonicBlockPackedReader ordIndex = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - - return new SortedSetDocValues() { - long offset; - long endOffset; - - @Override - public long nextOrd() { - if (offset == endOffset) { - return NO_MORE_ORDS; - } else { - long ord = ordinals.get(offset); - offset++; - return ord; - } - } - - @Override - public void setDocument(int docID) { - offset = (docID == 0 ? 
0 : ordIndex.get(docID-1)); - endOffset = ordIndex.get(docID); - } - - @Override - public void lookupOrd(long ord, BytesRef result) { - binary.get(ord, result); - } - - @Override - public long getValueCount() { - return valueCount; - } - }; - } - - @Override - public void close() throws IOException { - data.close(); - } - - static class NumericEntry { - long offset; - - int format; - int packedIntsVersion; - long count; - int blockSize; - - long minValue; - long gcd; - long table[]; - } - - static class BinaryEntry { - long offset; - - long count; - int minLength; - int maxLength; - long addressesOffset; - int packedIntsVersion; - int blockSize; - } - - // internally we compose complex dv (sorted/sortedset) from other ones - static abstract class LongNumericDocValues extends NumericDocValues { - @Override - public final long get(int docID) { - return get((long) docID); - } - - abstract long get(long id); - } - - static abstract class LongBinaryDocValues extends BinaryDocValues { - @Override - public final void get(int docID, BytesRef result) { - get((long)docID, result); - } - - abstract void get(long id, BytesRef Result); - } -} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java index 5fa95c4c9e0..bcfd9361e37 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java @@ -23,13 +23,13 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.compressing.dummy.DummyCompressingCodec; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import com.carrotsearch.randomizedtesting.generators.RandomInts; /** * A codec that uses {@link CompressingStoredFieldsFormat} for its stored - * fields and delegates to {@link Lucene42Codec} for everything else. + * fields and delegates to {@link Lucene45Codec} for everything else. 
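// Context for the delegate swap above (illustrative, not from this patch):
// FilterCodec forwards every format to the codec passed to its constructor unless a
// subclass overrides it, so retargeting these test codecs from Lucene42Codec to
// Lucene45Codec is purely a change of delegate. The pattern, using a hypothetical
// codec name and the same CompressingStoredFieldsFormat constructor shown below:
//
//   public final class MyCompressingCodec extends FilterCodec {
//     public MyCompressingCodec() {
//       super("MyCompressing", new Lucene45Codec());  // anything not overridden delegates
//     }
//     @Override
//     public StoredFieldsFormat storedFieldsFormat() {
//       return new CompressingStoredFieldsFormat("MyCompressing", "",
//           CompressionMode.FAST, 1 << 14);
//     }
//   }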
*/ public abstract class CompressingCodec extends FilterCodec { @@ -73,7 +73,7 @@ public abstract class CompressingCodec extends FilterCodec { * Creates a compressing codec with a given segment suffix */ public CompressingCodec(String name, String segmentSuffix, CompressionMode compressionMode, int chunkSize) { - super(name, new Lucene42Codec()); + super(name, new Lucene45Codec()); this.storedFieldsFormat = new CompressingStoredFieldsFormat(name, segmentSuffix, compressionMode, chunkSize); this.termVectorsFormat = new CompressingTermVectorsFormat(name, segmentSuffix, compressionMode, chunkSize); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java index 24f41ab2995..f973648aaeb 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java @@ -1,8 +1,6 @@ package org.apache.lucene.codecs.compressing; -import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.NormsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; import org.apache.lucene.util.packed.PackedInts; @@ -42,9 +40,4 @@ public class FastCompressingCodec extends CompressingCodec { public NormsFormat normsFormat() { return new Lucene42NormsFormat(PackedInts.FAST); } - - @Override - public DocValuesFormat docValuesFormat() { - return new Lucene42DocValuesFormat(PackedInts.FAST); - } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java index 7c6ba48f91d..d7c0451bcba 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java @@ -1,8 +1,6 @@ package org.apache.lucene.codecs.compressing; -import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.NormsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; import org.apache.lucene.util.packed.PackedInts; @@ -42,9 +40,4 @@ public class FastDecompressionCompressingCodec extends CompressingCodec { public NormsFormat normsFormat() { return new Lucene42NormsFormat(PackedInts.DEFAULT); } - - @Override - public DocValuesFormat docValuesFormat() { - return new Lucene42DocValuesFormat(PackedInts.DEFAULT); - } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java index c2c21d505f3..90c161cf696 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java @@ -24,7 +24,9 @@ import java.util.TreeSet; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.MissingOrdRemapper; import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosReader.LegacyDocValuesType; +import 
org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; @@ -54,7 +56,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { long minValue = Long.MAX_VALUE; long maxValue = Long.MIN_VALUE; for (Number n : values) { - long v = n.longValue(); + long v = n == null ? 0 : n.longValue(); minValue = Math.min(minValue, v); maxValue = Math.max(maxValue, v); } @@ -92,7 +94,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { Lucene40DocValuesFormat.INTS_VERSION_CURRENT); output.writeInt(1); // size for (Number n : values) { - output.writeByte(n.byteValue()); + output.writeByte(n == null ? 0 : n.byteValue()); } } @@ -103,7 +105,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { Lucene40DocValuesFormat.INTS_VERSION_CURRENT); output.writeInt(2); // size for (Number n : values) { - output.writeShort(n.shortValue()); + output.writeShort(n == null ? 0 : n.shortValue()); } } @@ -114,7 +116,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { Lucene40DocValuesFormat.INTS_VERSION_CURRENT); output.writeInt(4); // size for (Number n : values) { - output.writeInt(n.intValue()); + output.writeInt(n == null ? 0 : n.intValue()); } } @@ -131,7 +133,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { // writes longs output.writeByte(Lucene40DocValuesFormat.VAR_INTS_FIXED_64); for (Number n : values) { - output.writeLong(n.longValue()); + output.writeLong(n == null ? 0 : n.longValue()); } } else { // writes packed ints @@ -143,7 +145,8 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { PackedInts.bitsRequired(delta), PackedInts.DEFAULT); for (Number n : values) { - writer.add(n.longValue() - minValue); + long v = n == null ? 0 : n.longValue(); + writer.add(v - minValue); } writer.finish(); } @@ -156,6 +159,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { int minLength = Integer.MAX_VALUE; int maxLength = Integer.MIN_VALUE; for (BytesRef b : values) { + if (b == null) { + b = new BytesRef(); // 4.0 doesnt distinguish + } if (b.length > Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH) { throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH); } @@ -246,7 +252,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { output.writeInt(length); for (BytesRef v : values) { - output.writeBytes(v.bytes, v.offset, v.length); + if (v != null) { + output.writeBytes(v.bytes, v.offset, v.length); + } } } @@ -267,7 +275,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { final long startPos = data.getFilePointer(); for (BytesRef v : values) { - data.writeBytes(v.bytes, v.offset, v.length); + if (v != null) { + data.writeBytes(v.bytes, v.offset, v.length); + } } /* addresses */ @@ -282,7 +292,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { long currentPosition = 0; for (BytesRef v : values) { w.add(currentPosition); - currentPosition += v.length; + if (v != null) { + currentPosition += v.length; + } } // write sentinel assert currentPosition == maxAddress; @@ -304,7 +316,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { // deduplicate TreeSet dictionary = new TreeSet(); for (BytesRef v : values) { - dictionary.add(BytesRef.deepCopyOf(v)); + dictionary.add(v == null ? 
new BytesRef() : BytesRef.deepCopyOf(v)); } /* values */ @@ -321,6 +333,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(valueCount-1), PackedInts.DEFAULT); for (BytesRef v : values) { + if (v == null) { + v = new BytesRef(); + } int ord = dictionary.headSet(v).size(); w.add(ord); } @@ -341,7 +356,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { // deduplicate TreeSet dictionary = new TreeSet(); for (BytesRef v : values) { - dictionary.add(BytesRef.deepCopyOf(v)); + dictionary.add(v == null ? new BytesRef() : BytesRef.deepCopyOf(v)); } /* values */ @@ -362,7 +377,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(currentAddress), PackedInts.DEFAULT); for (BytesRef v : values) { - w.add(valueToAddress.get(v)); + w.add(valueToAddress.get(v == null ? new BytesRef() : v)); } w.finish(); } @@ -388,6 +403,15 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { maxLength = Math.max(maxLength, b.length); } + // but dont use fixed if there are missing values (we are simulating how lucene40 wrote dv...) + boolean anyMissing = false; + for (Number n : docToOrd) { + if (n.longValue() == -1) { + anyMissing = true; + break; + } + } + boolean success = false; IndexOutput data = null; IndexOutput index = null; @@ -397,12 +421,22 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { try { data = dir.createOutput(dataName, state.context); index = dir.createOutput(indexName, state.context); - if (minLength == maxLength) { + if (minLength == maxLength && !anyMissing) { // fixed byte[] addFixedSortedBytesField(field, data, index, values, docToOrd, minLength); } else { // var byte[] - addVarSortedBytesField(field, data, index, values, docToOrd); + // three cases for simulating the old writer: + // 1. no missing + // 2. missing (and empty string in use): remap ord=-1 -> ord=0 + // 3. 
missing (and empty string not in use): remap all ords +1, insert empty string into values + if (!anyMissing) { + addVarSortedBytesField(field, data, index, values, docToOrd); + } else if (minLength == 0) { + addVarSortedBytesField(field, data, index, values, MissingOrdRemapper.mapMissingToOrd0(docToOrd)); + } else { + addVarSortedBytesField(field, data, index, MissingOrdRemapper.insertEmptyValue(values), MissingOrdRemapper.mapAllOrds(docToOrd)); + } } success = true; } finally { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java similarity index 82% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java rename to lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java index dd11efaff82..ac8aeafd33c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java @@ -25,6 +25,7 @@ import java.util.NoSuchElementException; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.MissingOrdRemapper; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; @@ -45,25 +46,20 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; import org.apache.lucene.util.packed.PackedInts.FormatAndBits; import org.apache.lucene.util.packed.PackedInts; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.BYTES; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.NUMBER; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.FST; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.DELTA_COMPRESSED; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.GCD_COMPRESSED; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.TABLE_COMPRESSED; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.UNCOMPRESSED; + /** * Writer for {@link Lucene42DocValuesFormat} */ class Lucene42DocValuesConsumer extends DocValuesConsumer { - static final int VERSION_START = 0; - static final int VERSION_GCD_COMPRESSION = 1; - static final int VERSION_CURRENT = VERSION_GCD_COMPRESSION; - - static final byte NUMBER = 0; - static final byte BYTES = 1; - static final byte FST = 2; - - static final int BLOCK_SIZE = 4096; - - static final byte DELTA_COMPRESSED = 0; - static final byte TABLE_COMPRESSED = 1; - static final byte UNCOMPRESSED = 2; - static final byte GCD_COMPRESSED = 3; - final IndexOutput data, meta; final int maxDoc; final float acceptableOverheadRatio; @@ -106,7 +102,8 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { long count = 0; for (Number nv : values) { - final long v = nv.longValue(); + // TODO: support this as MemoryDVFormat (and be smart about missing maybe) + final long v = nv == null ? 
0 : nv.longValue(); if (gcd != 1) { if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { @@ -142,7 +139,7 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) { meta.writeByte(UNCOMPRESSED); // uncompressed for (Number nv : values) { - data.writeByte((byte) nv.longValue()); + data.writeByte(nv == null ? 0 : (byte) nv.longValue()); } } else { meta.writeByte(TABLE_COMPRESSED); // table-compressed @@ -160,7 +157,7 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE); for(Number nv : values) { - writer.add(encode.get(nv.longValue())); + writer.add(encode.get(nv == null ? 0 : nv.longValue())); } writer.finish(); } @@ -173,7 +170,8 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); for (Number nv : values) { - writer.add((nv.longValue() - minValue) / gcd); + long value = nv == null ? 0 : nv.longValue(); + writer.add((value - minValue) / gcd); } writer.finish(); } else { @@ -184,7 +182,7 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); for (Number nv : values) { - writer.add(nv.longValue()); + writer.add(nv == null ? 0 : nv.longValue()); } writer.finish(); } @@ -216,12 +214,15 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { int maxLength = Integer.MIN_VALUE; final long startFP = data.getFilePointer(); for(BytesRef v : values) { - if (v.length > Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH) { + final int length = v == null ? 0 : v.length; + if (length > Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH) { throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH); } - minLength = Math.min(minLength, v.length); - maxLength = Math.max(maxLength, v.length); - data.writeBytes(v.bytes, v.offset, v.length); + minLength = Math.min(minLength, length); + maxLength = Math.max(maxLength, length); + if (v != null) { + data.writeBytes(v.bytes, v.offset, v.length); + } } meta.writeLong(startFP); meta.writeLong(data.getFilePointer() - startFP); @@ -237,7 +238,9 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE); long addr = 0; for (BytesRef v : values) { - addr += v.length; + if (v != null) { + addr += v.length; + } writer.add(addr); } writer.finish(); @@ -265,6 +268,33 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { @Override public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + // three cases for simulating the old writer: + // 1. no missing + // 2. missing (and empty string in use): remap ord=-1 -> ord=0 + // 3. 
missing (and empty string not in use): remap all ords +1, insert empty string into values + boolean anyMissing = false; + for (Number n : docToOrd) { + if (n.longValue() == -1) { + anyMissing = true; + break; + } + } + + boolean hasEmptyString = false; + for (BytesRef b : values) { + hasEmptyString = b.length == 0; + break; + } + + if (!anyMissing) { + // nothing to do + } else if (hasEmptyString) { + docToOrd = MissingOrdRemapper.mapMissingToOrd0(docToOrd); + } else { + docToOrd = MissingOrdRemapper.mapAllOrds(docToOrd); + values = MissingOrdRemapper.insertEmptyValue(values); + } + // write the ordinals as numerics addNumericField(field, docToOrd, false); diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java new file mode 100644 index 00000000000..5ef50ea522f --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java @@ -0,0 +1,39 @@ +package org.apache.lucene.codecs.lucene42; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.NormsFormat; + +/** + * Read-write version of {@link Lucene42Codec} for testing. + */ +public class Lucene42RWCodec extends Lucene42Codec { + private static final DocValuesFormat dv = new Lucene42RWDocValuesFormat(); + private static final NormsFormat norms = new Lucene42NormsFormat(); + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return dv; + } + + @Override + public NormsFormat normsFormat() { + return norms; + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWDocValuesFormat.java new file mode 100644 index 00000000000..45372246184 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWDocValuesFormat.java @@ -0,0 +1,35 @@ +package org.apache.lucene.codecs.lucene42; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.index.SegmentWriteState; + +/** + * Read-write version of {@link Lucene42DocValuesFormat} for testing. + */ +public class Lucene42RWDocValuesFormat extends Lucene42DocValuesFormat { + + @Override + public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + // note: we choose DEFAULT here (its reasonably fast, and for small bpv has tiny waste) + return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, acceptableOverheadRatio); + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/package.html b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/package.html new file mode 100644 index 00000000000..f1c62d1e049 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/package.html @@ -0,0 +1,25 @@ + + + + + + + +Support for testing {@link org.apache.lucene.codecs.lucene42.Lucene42Codec}. + + \ No newline at end of file diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/package.html b/lucene/test-framework/src/java/org/apache/lucene/codecs/package.html new file mode 100644 index 00000000000..ca70ffc3b2e --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/package.html @@ -0,0 +1,25 @@ + + + + + + + +Codecs for testing (simulate old disk formats, wacky theoretical use cases, etc) + + diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java index 6db5ccdeea3..086cb21fb89 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java @@ -438,14 +438,14 @@ public class AssertingAtomicReader extends FilterAtomicReader { this.in = in; this.maxDoc = maxDoc; this.valueCount = in.getValueCount(); - assert valueCount >= 1 && valueCount <= maxDoc; + assert valueCount >= 0 && valueCount <= maxDoc; } @Override public int getOrd(int docID) { assert docID >= 0 && docID < maxDoc; int ord = in.getOrd(docID); - assert ord >= 0 && ord < valueCount; + assert ord >= -1 && ord < valueCount; return ord; } @@ -607,6 +607,54 @@ public class AssertingAtomicReader extends FilterAtomicReader { return null; } } + + /** Wraps a Bits but with additional asserts */ + public static class AssertingBits implements Bits { + final Bits in; + + public AssertingBits(Bits in) { + this.in = in; + } + + @Override + public boolean get(int index) { + assert index >= 0 && index < length(); + return in.get(index); + } + + @Override + public int length() { + return in.length(); + } + } + + @Override + public Bits getLiveDocs() { + Bits liveDocs = super.getLiveDocs(); + if (liveDocs != null) { + assert maxDoc() == liveDocs.length(); + liveDocs = new AssertingBits(liveDocs); + } else { + assert maxDoc() == numDocs(); + assert !hasDeletions(); + } + return liveDocs; + } + + @Override + 
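// The override added just below asserts the LUCENE-5178 contract for
// getDocsWithField (summary, not patch lines): a non-null Bits may be returned only
// for a field that exists and has doc values, the returned Bits must span exactly
// maxDoc() entries, and a null return is valid only when the field is unknown or
// has no doc values.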
public Bits getDocsWithField(String field) throws IOException { + Bits docsWithField = super.getDocsWithField(field); + FieldInfo fi = getFieldInfos().fieldInfo(field); + if (docsWithField != null) { + assert fi != null; + assert fi.hasDocValues(); + assert maxDoc() == docsWithField.length(); + docsWithField = new AssertingBits(docsWithField); + } else { + assert fi == null || fi.hasDocValues() == false; + } + return docsWithField; + } // this is the same hack as FCInvisible @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index d4f8866732c..9c94bb61567 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -26,6 +26,7 @@ import java.util.Map.Entry; import java.util.Map; import java.util.Set; import java.util.TreeSet; +import java.util.concurrent.CountDownLatch; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; @@ -51,6 +52,7 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.LuceneTestCase; @@ -649,6 +651,43 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { ireader.close(); directory.close(); } + + public void testSortedMergeAwayAllValues() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.NO)); + iwriter.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new SortedDocValuesField("field", new BytesRef("hello"))); + iwriter.addDocument(doc); + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + SortedDocValues dv = getOnlySegmentReader(ireader).getSortedDocValues("field"); + if (defaultCodecSupportsDocsWithField()) { + assertEquals(-1, dv.getOrd(0)); + assertEquals(0, dv.getValueCount()); + } else { + assertEquals(0, dv.getOrd(0)); + assertEquals(1, dv.getValueCount()); + BytesRef ref = new BytesRef(); + dv.lookupOrd(0, ref); + assertEquals(new BytesRef(), ref); + } + + ireader.close(); + directory.close(); + } public void testBytesWithNewline() throws IOException { Analyzer analyzer = new MockAnalyzer(random()); @@ -695,7 +734,10 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { BytesRef scratch = new BytesRef(); dv.lookupOrd(dv.getOrd(0), scratch); assertEquals(new BytesRef("hello world 2"), scratch); - dv.lookupOrd(dv.getOrd(1), scratch); + if (defaultCodecSupportsDocsWithField()) { + assertEquals(-1, dv.getOrd(1)); + } + dv.get(1, scratch); assertEquals(new BytesRef(""), scratch); ireader.close(); directory.close(); @@ -1074,8 +1116,10 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { 
doc.add(newTextField("id", "noValue", Field.Store.YES)); w.addDocument(doc); } - BytesRef bytesRef = new BytesRef(); - hash.add(bytesRef); // add empty value for the gaps + if (!defaultCodecSupportsDocsWithField()) { + BytesRef bytesRef = new BytesRef(); + hash.add(bytesRef); // add empty value for the gaps + } if (rarely()) { w.commit(); } @@ -1189,6 +1233,73 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { dir.close(); } + private void doTestMissingVsFieldCache(final long minValue, final long maxValue) throws Exception { + doTestMissingVsFieldCache(new LongProducer() { + @Override + long next() { + return _TestUtil.nextLong(random(), minValue, maxValue); + } + }); + } + + private void doTestMissingVsFieldCache(LongProducer longs) throws Exception { + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); + Directory dir = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); + Field idField = new StringField("id", "", Field.Store.NO); + Field indexedField = newStringField("indexed", "", Field.Store.NO); + Field dvField = new NumericDocValuesField("dv", 0); + + + // index some docs + int numDocs = atLeast(300); + // numDocs should be always > 256 so that in case of a codec that optimizes + // for numbers of values <= 256, all storage layouts are tested + assert numDocs > 256; + for (int i = 0; i < numDocs; i++) { + idField.setStringValue(Integer.toString(i)); + long value = longs.next(); + indexedField.setStringValue(Long.toString(value)); + dvField.setLongValue(value); + Document doc = new Document(); + doc.add(idField); + // 1/4 of the time we neglect to add the fields + if (random().nextInt(4) > 0) { + doc.add(indexedField); + doc.add(dvField); + } + writer.addDocument(doc); + if (random().nextInt(31) == 0) { + writer.commit(); + } + } + + // delete some docs + int numDeletions = random().nextInt(numDocs/10); + for (int i = 0; i < numDeletions; i++) { + int id = random().nextInt(numDocs); + writer.deleteDocuments(new Term("id", Integer.toString(id))); + } + + // merge some segments and ensure that at least one of them has more than + // 256 values + writer.forceMerge(numDocs / 256); + + writer.close(); + + // compare + DirectoryReader ir = DirectoryReader.open(dir); + for (AtomicReaderContext context : ir.leaves()) { + AtomicReader r = context.reader(); + Bits expected = FieldCache.DEFAULT.getDocsWithField(r, "indexed"); + Bits actual = FieldCache.DEFAULT.getDocsWithField(r, "dv"); + assertEquals(expected, actual); + } + ir.close(); + dir.close(); + } + public void testBooleanNumericsVsStoredFields() throws Exception { int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { @@ -1203,6 +1314,13 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } } + public void testByteMissingVsFieldCache() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestMissingVsFieldCache(Byte.MIN_VALUE, Byte.MAX_VALUE); + } + } + public void testShortNumericsVsStoredFields() throws Exception { int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { @@ -1210,6 +1328,13 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } } + public void testShortMissingVsFieldCache() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + 
doTestMissingVsFieldCache(Short.MIN_VALUE, Short.MAX_VALUE); + } + } + public void testIntNumericsVsStoredFields() throws Exception { int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { @@ -1217,6 +1342,13 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } } + public void testIntMissingVsFieldCache() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestMissingVsFieldCache(Integer.MIN_VALUE, Integer.MAX_VALUE); + } + } + public void testLongNumericsVsStoredFields() throws Exception { int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { @@ -1224,6 +1356,13 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } } + public void testLongMissingVsFieldCache() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestMissingVsFieldCache(Long.MIN_VALUE, Long.MAX_VALUE); + } + } + private void doTestBinaryVsStoredFields(int minLength, int maxLength) throws Exception { Directory dir = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); @@ -1971,6 +2110,13 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { doTestSortedSetVsStoredFields(1, 10); } } + + private void assertEquals(Bits expected, Bits actual) throws Exception { + assertEquals(expected.length(), actual.length()); + for (int i = 0; i < expected.length(); i++) { + assertEquals(expected.get(i), actual.get(i)); + } + } private void assertEquals(int maxDoc, SortedDocValues expected, SortedDocValues actual) throws Exception { assertEquals(maxDoc, new SingletonSortedSetDocValues(expected), new SingletonSortedSetDocValues(actual)); @@ -2198,6 +2344,206 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { doTestNumericsVsStoredFields(longs); } } + + public void testTwoNumbersOneMissing() throws IOException { + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new NumericDocValuesField("dv1", 0)); + iw.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + NumericDocValues dv = ar.getNumericDocValues("dv1"); + assertEquals(0, dv.get(0)); + assertEquals(0, dv.get(1)); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + ir.close(); + directory.close(); + } + + public void testTwoNumbersOneMissingWithMerging() throws IOException { + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new 
NumericDocValuesField("dv1", 0)); + iw.addDocument(doc); + iw.commit(); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + NumericDocValues dv = ar.getNumericDocValues("dv1"); + assertEquals(0, dv.get(0)); + assertEquals(0, dv.get(1)); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + ir.close(); + directory.close(); + } + + public void testThreeNumbersOneMissingWithMerging() throws IOException { + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new NumericDocValuesField("dv1", 0)); + iw.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.commit(); + doc = new Document(); + doc.add(new StringField("id", "2", Field.Store.YES)); + doc.add(new NumericDocValuesField("dv1", 5)); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + NumericDocValues dv = ar.getNumericDocValues("dv1"); + assertEquals(0, dv.get(0)); + assertEquals(0, dv.get(1)); + assertEquals(5, dv.get(2)); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + assertTrue(docsWithField.get(2)); + ir.close(); + directory.close(); + } + + public void testTwoBytesOneMissing() throws IOException { + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv1", new BytesRef())); + iw.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + BinaryDocValues dv = ar.getBinaryDocValues("dv1"); + BytesRef ref = new BytesRef(); + dv.get(0, ref); + assertEquals(new BytesRef(), ref); + dv.get(1, ref); + assertEquals(new BytesRef(), ref); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + ir.close(); + directory.close(); + } + + public void testTwoBytesOneMissingWithMerging() throws IOException { + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new 
RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv1", new BytesRef())); + iw.addDocument(doc); + iw.commit(); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + BinaryDocValues dv = ar.getBinaryDocValues("dv1"); + BytesRef ref = new BytesRef(); + dv.get(0, ref); + assertEquals(new BytesRef(), ref); + dv.get(1, ref); + assertEquals(new BytesRef(), ref); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + ir.close(); + directory.close(); + } + + public void testThreeBytesOneMissingWithMerging() throws IOException { + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv1", new BytesRef())); + iw.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.commit(); + doc = new Document(); + doc.add(new StringField("id", "2", Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv1", new BytesRef("boo"))); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + BinaryDocValues dv = ar.getBinaryDocValues("dv1"); + BytesRef ref = new BytesRef(); + dv.get(0, ref); + assertEquals(new BytesRef(), ref); + dv.get(1, ref); + assertEquals(new BytesRef(), ref); + dv.get(2, ref); + assertEquals(new BytesRef("boo"), ref); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + assertTrue(docsWithField.get(2)); + ir.close(); + directory.close(); + } // LUCENE-4853 public void testHugeBinaryValues() throws Exception { @@ -2295,6 +2641,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { d.close(); } + // TODO: get this out of here and into the deprecated codecs (4.0, 4.2) public void testHugeBinaryValueLimit() throws Exception { // We only test DVFormats that have a limit assumeFalse("test requires codec with limits on max binary field length", codecAcceptsHugeBinaryValues("field")); @@ -2363,6 +2710,243 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { ar.close(); d.close(); } + + /** Tests dv against stored fields with threads (binary/numeric/sorted, no missing) */ + public void testThreads() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); + Document doc = new Document(); + Field idField = new StringField("id", "", Field.Store.NO); + Field storedBinField = new StoredField("storedBin", new byte[0]); + Field dvBinField = new BinaryDocValuesField("dvBin", new BytesRef()); + Field 
dvSortedField = new SortedDocValuesField("dvSorted", new BytesRef()); + Field storedNumericField = new StoredField("storedNum", ""); + Field dvNumericField = new NumericDocValuesField("dvNum", 0); + doc.add(idField); + doc.add(storedBinField); + doc.add(dvBinField); + doc.add(dvSortedField); + doc.add(storedNumericField); + doc.add(dvNumericField); + + // index some docs + int numDocs = atLeast(300); + for (int i = 0; i < numDocs; i++) { + idField.setStringValue(Integer.toString(i)); + int length = _TestUtil.nextInt(random(), 0, 8); + byte buffer[] = new byte[length]; + random().nextBytes(buffer); + storedBinField.setBytesValue(buffer); + dvBinField.setBytesValue(buffer); + dvSortedField.setBytesValue(buffer); + long numericValue = random().nextLong(); + storedNumericField.setStringValue(Long.toString(numericValue)); + dvNumericField.setLongValue(numericValue); + writer.addDocument(doc); + if (random().nextInt(31) == 0) { + writer.commit(); + } + } + + // delete some docs + int numDeletions = random().nextInt(numDocs/10); + for (int i = 0; i < numDeletions; i++) { + int id = random().nextInt(numDocs); + writer.deleteDocuments(new Term("id", Integer.toString(id))); + } + writer.close(); + + // compare + final DirectoryReader ir = DirectoryReader.open(dir); + int numThreads = _TestUtil.nextInt(random(), 2, 7); + Thread threads[] = new Thread[numThreads]; + final CountDownLatch startingGun = new CountDownLatch(1); + + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread() { + @Override + public void run() { + try { + startingGun.await(); + for (AtomicReaderContext context : ir.leaves()) { + AtomicReader r = context.reader(); + BinaryDocValues binaries = r.getBinaryDocValues("dvBin"); + SortedDocValues sorted = r.getSortedDocValues("dvSorted"); + NumericDocValues numerics = r.getNumericDocValues("dvNum"); + for (int j = 0; j < r.maxDoc(); j++) { + BytesRef binaryValue = r.document(j).getBinaryValue("storedBin"); + BytesRef scratch = new BytesRef(); + binaries.get(j, scratch); + assertEquals(binaryValue, scratch); + sorted.get(j, scratch); + assertEquals(binaryValue, scratch); + String expected = r.document(j).get("storedNum"); + assertEquals(Long.parseLong(expected), numerics.get(j)); + } + } + _TestUtil.checkReader(ir); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + }; + threads[i].start(); + } + startingGun.countDown(); + for (Thread t : threads) { + t.join(); + } + ir.close(); + dir.close(); + } + + /** Tests dv against stored fields with threads (all types + missing) */ + public void testThreads2() throws Exception { + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); + assumeTrue("Codec does not support SORTED_SET", defaultCodecSupportsSortedSet()); + Directory dir = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); + Field idField = new StringField("id", "", Field.Store.NO); + Field storedBinField = new StoredField("storedBin", new byte[0]); + Field dvBinField = new BinaryDocValuesField("dvBin", new BytesRef()); + Field dvSortedField = new SortedDocValuesField("dvSorted", new BytesRef()); + Field storedNumericField = new StoredField("storedNum", ""); + Field dvNumericField = new NumericDocValuesField("dvNum", 0); + + // index some docs + int numDocs = atLeast(300); + for (int i = 0; i < numDocs; i++) { + idField.setStringValue(Integer.toString(i)); + int length = 
_TestUtil.nextInt(random(), 0, 8); + byte buffer[] = new byte[length]; + random().nextBytes(buffer); + storedBinField.setBytesValue(buffer); + dvBinField.setBytesValue(buffer); + dvSortedField.setBytesValue(buffer); + long numericValue = random().nextLong(); + storedNumericField.setStringValue(Long.toString(numericValue)); + dvNumericField.setLongValue(numericValue); + Document doc = new Document(); + doc.add(idField); + if (random().nextInt(4) > 0) { + doc.add(storedBinField); + doc.add(dvBinField); + doc.add(dvSortedField); + } + if (random().nextInt(4) > 0) { + doc.add(storedNumericField); + doc.add(dvNumericField); + } + int numSortedSetFields = random().nextInt(3); + Set values = new TreeSet(); + for (int j = 0; j < numSortedSetFields; j++) { + values.add(_TestUtil.randomSimpleString(random())); + } + for (String v : values) { + doc.add(new SortedSetDocValuesField("dvSortedSet", new BytesRef(v))); + doc.add(new StoredField("storedSortedSet", v)); + } + writer.addDocument(doc); + if (random().nextInt(31) == 0) { + writer.commit(); + } + } + + // delete some docs + int numDeletions = random().nextInt(numDocs/10); + for (int i = 0; i < numDeletions; i++) { + int id = random().nextInt(numDocs); + writer.deleteDocuments(new Term("id", Integer.toString(id))); + } + writer.close(); + + // compare + final DirectoryReader ir = DirectoryReader.open(dir); + int numThreads = _TestUtil.nextInt(random(), 2, 7); + Thread threads[] = new Thread[numThreads]; + final CountDownLatch startingGun = new CountDownLatch(1); + + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread() { + @Override + public void run() { + try { + startingGun.await(); + for (AtomicReaderContext context : ir.leaves()) { + AtomicReader r = context.reader(); + BinaryDocValues binaries = r.getBinaryDocValues("dvBin"); + Bits binaryBits = r.getDocsWithField("dvBin"); + SortedDocValues sorted = r.getSortedDocValues("dvSorted"); + Bits sortedBits = r.getDocsWithField("dvSorted"); + NumericDocValues numerics = r.getNumericDocValues("dvNum"); + Bits numericBits = r.getDocsWithField("dvNum"); + SortedSetDocValues sortedSet = r.getSortedSetDocValues("dvSortedSet"); + Bits sortedSetBits = r.getDocsWithField("dvSortedSet"); + for (int j = 0; j < r.maxDoc(); j++) { + BytesRef binaryValue = r.document(j).getBinaryValue("storedBin"); + if (binaryValue != null) { + if (binaries != null) { + BytesRef scratch = new BytesRef(); + binaries.get(j, scratch); + assertEquals(binaryValue, scratch); + sorted.get(j, scratch); + assertEquals(binaryValue, scratch); + assertTrue(binaryBits.get(j)); + assertTrue(sortedBits.get(j)); + } + } else if (binaries != null) { + assertFalse(binaryBits.get(j)); + assertFalse(sortedBits.get(j)); + assertEquals(-1, sorted.getOrd(j)); + } + + String number = r.document(j).get("storedNum"); + if (number != null) { + if (numerics != null) { + assertEquals(Long.parseLong(number), numerics.get(j)); + } + } else if (numerics != null) { + assertFalse(numericBits.get(j)); + assertEquals(0, numerics.get(j)); + } + + String values[] = r.document(j).getValues("storedSortedSet"); + if (values.length > 0) { + assertNotNull(sortedSet); + sortedSet.setDocument(j); + for (int i = 0; i < values.length; i++) { + long ord = sortedSet.nextOrd(); + assertTrue(ord != SortedSetDocValues.NO_MORE_ORDS); + BytesRef value = new BytesRef(); + sortedSet.lookupOrd(ord, value); + assertEquals(values[i], value.utf8ToString()); + } + assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSet.nextOrd()); + assertTrue(sortedSetBits.get(j)); + 
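// What the assertions above exercised (summary, not patch lines): the SortedSet
// read protocol is stateful -- setDocument(doc) positions the view on a document,
// nextOrd() then returns ordinals in increasing order until NO_MORE_ORDS, and
// lookupOrd(ord, ref) materializes each value; with LUCENE-5178, the matching
// getDocsWithField bit must agree with whether any ordinal came back at all.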
} else if (sortedSet != null) { + sortedSet.setDocument(j); + assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSet.nextOrd()); + assertFalse(sortedSetBits.get(j)); + } + } + } + _TestUtil.checkReader(ir); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + }; + threads[i].start(); + } + startingGun.countDown(); + for (Thread t : threads) { + t.join(); + } + ir.close(); + dir.close(); + } protected boolean codecAcceptsHugeBinaryValues(String field) { return true; diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java index 430948704a2..c25a0f3c764 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java @@ -32,7 +32,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.compressing.CompressingCodec; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.document.Document; import org.apache.lucene.document.DoubleField; @@ -502,7 +502,7 @@ public abstract class BaseStoredFieldsFormatTestCase extends LuceneTestCase { // get another codec, other than the default: so we are merging segments across different codecs final Codec otherCodec; if ("SimpleText".equals(Codec.getDefault().getName())) { - otherCodec = new Lucene42Codec(); + otherCodec = new Lucene45Codec(); } else { otherCodec = new SimpleTextCodec(); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java index a0a2521ef7d..66c24874b55 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.Set; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.FilterIterator; /** @@ -136,6 +137,11 @@ public final class FieldFilterAtomicReader extends FilterAtomicReader { return hasField(field) ? super.getNormValues(field) : null; } + @Override + public Bits getDocsWithField(String field) throws IOException { + return hasField(field) ? 
super.getDocsWithField(field) : null; + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("FieldFilterAtomicReader(reader="); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index 9ecc5122f33..3530a31ce21 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -35,12 +35,12 @@ import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; import org.apache.lucene.codecs.lucene41ords.Lucene41WithOrds; import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapDocFreqInterval; import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapFixedInterval; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; -import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat; import org.apache.lucene.codecs.bloom.TestBloomFilteredLucene41Postings; -import org.apache.lucene.codecs.cheapbastard.CheapBastardDocValuesFormat; import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat; import org.apache.lucene.codecs.memory.DirectPostingsFormat; +import org.apache.lucene.codecs.memory.MemoryDocValuesFormat; import org.apache.lucene.codecs.memory.MemoryPostingsFormat; import org.apache.lucene.codecs.mockintblock.MockFixedIntBlockPostingsFormat; import org.apache.lucene.codecs.mockintblock.MockVariableIntBlockPostingsFormat; @@ -62,7 +62,7 @@ import org.apache.lucene.util._TestUtil; * documents in different orders and the test will still be deterministic * and reproducable. */ -public class RandomCodec extends Lucene42Codec { +public class RandomCodec extends Lucene45Codec { /** Shuffled list of postings formats to use for new mappings */ private List formats = new ArrayList(); @@ -148,11 +148,11 @@ public class RandomCodec extends Lucene42Codec { new MemoryPostingsFormat(false, random.nextFloat())); addDocValues(avoidCodecs, - new Lucene42DocValuesFormat(), + new Lucene45DocValuesFormat(), new DiskDocValuesFormat(), + new MemoryDocValuesFormat(), new SimpleTextDocValuesFormat(), - new AssertingDocValuesFormat(), - new CheapBastardDocValuesFormat()); + new AssertingDocValuesFormat()); Collections.shuffle(formats, random); Collections.shuffle(dvFormats, random); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java index 3060d683c26..1ca13974195 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java @@ -1368,6 +1368,16 @@ public abstract class LuceneTestCase extends Assert { } return true; } + + /** Returns true if the codec "supports" docsWithField + * (other codecs return MatchAllBits, because you couldnt write missing values before) */ + public static boolean defaultCodecSupportsDocsWithField() { + String name = Codec.getDefault().getName(); + if (name.equals("Lucene40") || name.equals("Lucene41") || name.equals("Lucene42")) { + return false; + } + return true; + } public void assertReaderEquals(String info, IndexReader leftReader, IndexReader rightReader) throws IOException { assertReaderStatisticsEquals(info, leftReader, rightReader); @@ -1962,6 +1972,20 @@ public abstract class LuceneTestCase extends 
Assert { assertNull(info, rightValues); } } + + { + Bits leftBits = MultiDocValues.getDocsWithField(leftReader, field); + Bits rightBits = MultiDocValues.getDocsWithField(rightReader, field); + if (leftBits != null && rightBits != null) { + assertEquals(info, leftBits.length(), rightBits.length()); + for (int i = 0; i < leftBits.length(); i++) { + assertEquals(info, leftBits.get(i), rightBits.get(i)); + } + } else { + assertNull(info, leftBits); + assertNull(info, rightBits); + } + } } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java index ebe0426a18c..54159212fab 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java @@ -40,6 +40,8 @@ import org.apache.lucene.codecs.lucene40.Lucene40RWCodec; import org.apache.lucene.codecs.lucene40.Lucene40RWPostingsFormat; import org.apache.lucene.codecs.lucene41.Lucene41RWCodec; import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene42.Lucene42RWCodec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.index.RandomCodec; import org.apache.lucene.search.RandomSimilarityProvider; @@ -161,6 +163,13 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { !shouldAvoidCodec("Lucene41"))) { codec = Codec.forName("Lucene41"); assert codec instanceof Lucene41RWCodec : "fix your classpath to have tests-framework.jar before lucene-core.jar"; + } else if ("Lucene42".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && + "random".equals(TEST_POSTINGSFORMAT) && + "random".equals(TEST_DOCVALUESFORMAT) && + randomVal == 2 && + !shouldAvoidCodec("Lucene42"))) { + codec = Codec.forName("Lucene42"); + assert codec instanceof Lucene42RWCodec : "fix your classpath to have tests-framework.jar before lucene-core.jar"; } else if (("random".equals(TEST_POSTINGSFORMAT) == false) || ("random".equals(TEST_DOCVALUESFORMAT) == false)) { // the user wired postings or DV: this is messy // refactor into RandomCodec.... 
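(A note on the two pieces above, with an illustrative sketch that is not part of the patch: `assumeTrue` is the JUnit assumption available on LuceneTestCase, while the reader variable and the "dv" field name are hypothetical.)

    // Skip when the randomly-chosen default codec predates missing-value
    // support: Lucene40/41/42 always report MatchAllBits for getDocsWithField.
    assumeTrue("default codec does not support docsWithField",
               defaultCodecSupportsDocsWithField());

    // With a capable codec, each AtomicReader exposes which docs have a value:
    Bits docsWithField = reader.getDocsWithField("dv");
    for (int doc = 0; doc < reader.maxDoc(); doc++) {
      if (!docsWithField.get(doc)) {
        // this document has no value for the "dv" field
      }
    }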
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
index ebe0426a18c..54159212fab 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
@@ -40,6 +40,8 @@ import org.apache.lucene.codecs.lucene40.Lucene40RWCodec;
 import org.apache.lucene.codecs.lucene40.Lucene40RWPostingsFormat;
 import org.apache.lucene.codecs.lucene41.Lucene41RWCodec;
 import org.apache.lucene.codecs.lucene42.Lucene42Codec;
+import org.apache.lucene.codecs.lucene42.Lucene42RWCodec;
+import org.apache.lucene.codecs.lucene45.Lucene45Codec;
 import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
 import org.apache.lucene.index.RandomCodec;
 import org.apache.lucene.search.RandomSimilarityProvider;
@@ -161,6 +163,13 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
                                          !shouldAvoidCodec("Lucene41"))) {
       codec = Codec.forName("Lucene41");
       assert codec instanceof Lucene41RWCodec : "fix your classpath to have tests-framework.jar before lucene-core.jar";
+    } else if ("Lucene42".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) &&
+                                                 "random".equals(TEST_POSTINGSFORMAT) &&
+                                                 "random".equals(TEST_DOCVALUESFORMAT) &&
+                                                 randomVal == 2 &&
+                                                 !shouldAvoidCodec("Lucene42"))) {
+      codec = Codec.forName("Lucene42");
+      assert codec instanceof Lucene42RWCodec : "fix your classpath to have tests-framework.jar before lucene-core.jar";
     } else if (("random".equals(TEST_POSTINGSFORMAT) == false) || ("random".equals(TEST_DOCVALUESFORMAT) == false)) {
       // the user wired postings or DV: this is messy
       // refactor into RandomCodec....
@@ -182,7 +191,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
         dvFormat = DocValuesFormat.forName(TEST_DOCVALUESFORMAT);
       }
 
-      codec = new Lucene42Codec() {
+      codec = new Lucene45Codec() {
         @Override
         public PostingsFormat getPostingsFormatForField(String field) {
           return format;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
index 92e10d5869a..19c16bcd9bc 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
@@ -44,7 +44,7 @@ import java.util.zip.ZipFile;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene42.Lucene42Codec;
+import org.apache.lucene.codecs.lucene45.Lucene45Codec;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 import org.apache.lucene.document.BinaryDocValuesField;
@@ -703,7 +703,7 @@ public class _TestUtil {
     if (LuceneTestCase.VERBOSE) {
       System.out.println("forcing postings format to:" + format);
     }
-    return new Lucene42Codec() {
+    return new Lucene45Codec() {
       @Override
       public PostingsFormat getPostingsFormatForField(String field) {
         return format;
@@ -721,7 +721,7 @@ public class _TestUtil {
     if (LuceneTestCase.VERBOSE) {
       System.out.println("forcing docvalues format to:" + format);
     }
-    return new Lucene42Codec() {
+    return new Lucene45Codec() {
      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        return format;
@@ -743,6 +743,7 @@ public class _TestUtil {
       return p.getName();
     }
   }
+
   public static String getDocValuesFormat(String field) {
     return getDocValuesFormat(Codec.getDefault(), field);
   }
@@ -756,11 +757,13 @@ public class _TestUtil {
     }
   }
 
+  // TODO: remove this, push this test to Lucene40/Lucene42 codec tests
   public static boolean fieldSupportsHugeBinaryDocValues(String field) {
     String dvFormat = getDocValuesFormat(field);
-    return dvFormat.equals("CheapBastard") ||
-           dvFormat.equals("Disk") ||
-           dvFormat.equals("SimpleText");
+    if (dvFormat.equals("Lucene40") || dvFormat.equals("Lucene42") || dvFormat.equals("Memory")) {
+      return false;
+    }
+    return true;
   }
 
   public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException {
diff --git a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index 9bd5ca87fd7..554bb0fbe07 100644
--- a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -21,3 +21,4 @@ org.apache.lucene.codecs.compressing.HighCompressionCompressingCodec
 org.apache.lucene.codecs.compressing.dummy.DummyCompressingCodec
 org.apache.lucene.codecs.lucene40.Lucene40RWCodec
 org.apache.lucene.codecs.lucene41.Lucene41RWCodec
+org.apache.lucene.codecs.lucene42.Lucene42RWCodec
diff --git a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
index 2eeda2459c7..2086be1f0e7 100644
--- a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
+++ b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
@@ -14,4 +14,4 @@
 # limitations under the License.
 
 org.apache.lucene.codecs.asserting.AssertingDocValuesFormat
-org.apache.lucene.codecs.cheapbastard.CheapBastardDocValuesFormat
\ No newline at end of file
+org.apache.lucene.codecs.lucene42.Lucene42RWDocValuesFormat
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index bd9eb61aeea..4824132980e 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -96,6 +96,10 @@ New Features
 
 * SOLR-4718: Allow solr.xml to be stored in ZooKeeper
 
 * SOLR-5156: Enhance ZkCLI to allow uploading of arbitrary files to ZK.
 
+* SOLR-5165: Single-valued docValues fields no longer require a default value.
+  Additionally they work with sortMissingFirst, sortMissingLast, facet.missing,
+  exists() in function queries, etc.  (Robert Muir)
+
 Bug Fixes
 ----------------------
diff --git a/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java b/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java
index 52d48280945..49bca112fc2 100644
--- a/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java
+++ b/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java
@@ -3,7 +3,7 @@ package org.apache.solr.core;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene42.Lucene42Codec;
+import org.apache.lucene.codecs.lucene45.Lucene45Codec;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.schema.SchemaField;
 import org.apache.solr.util.plugin.SolrCoreAware;
@@ -51,7 +51,7 @@ public class SchemaCodecFactory extends CodecFactory implements SolrCoreAware {
   @Override
   public void init(NamedList args) {
     super.init(args);
-    codec = new Lucene42Codec() {
+    codec = new Lucene45Codec() {
       @Override
       public PostingsFormat getPostingsFormatForField(String field) {
         final SchemaField fieldOrNull = core.getLatestSchema().getFieldOrNull(field);
diff --git a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java
index 4dfc114a189..fb424fdee8b 100644
--- a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java
+++ b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java
@@ -218,11 +218,7 @@ public class DocValuesFacets {
   static NamedList finalize(NamedList res, SolrIndexSearcher searcher, SchemaField schemaField, DocSet docs, int missingCount, boolean missing) throws IOException {
     if (missing) {
       if (missingCount < 0) {
-        if (schemaField.multiValued()) {
-          missingCount = SimpleFacets.getFieldMissingCount(searcher,docs,schemaField.getName());
-        } else {
-          missingCount = 0; // single-valued dv is implicitly 0
-        }
+        missingCount = SimpleFacets.getFieldMissingCount(searcher,docs,schemaField.getName());
       }
       res.add(null, missingCount);
     }
@@ -231,12 +227,12 @@ public class DocValuesFacets {
   }
 
   /** accumulates per-segment single-valued facet counts, mapping to global ordinal space */
-  // specialized since the single-valued case is simpler: you don't have to deal with missing count, etc
+  // specialized since the single-valued case is different
   static void accumSingle(int counts[], int startTermIndex, SortedDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
     int doc;
     while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
       int term = si.getOrd(doc);
-      if (map != null) {
+      if (map != null && term >= 0) {
         term = (int) map.getGlobalOrd(subIndex, term);
       }
       int arrIdx = term-startTermIndex;
diff --git a/solr/core/src/java/org/apache/solr/request/NumericFacets.java b/solr/core/src/java/org/apache/solr/request/NumericFacets.java
index 96796c996be..62950e2e7ca 100644
--- a/solr/core/src/java/org/apache/solr/request/NumericFacets.java
+++ b/solr/core/src/java/org/apache/solr/request/NumericFacets.java
@@ -190,8 +190,9 @@ final class NumericFacets {
           }
           docsWithField = FieldCache.DEFAULT.getDocsWithField(ctx.reader(), fieldName);
         }
-        if (docsWithField.get(doc - ctx.docBase)) {
-          hashTable.add(doc, longs.get(doc - ctx.docBase), 1);
+        long v = longs.get(doc - ctx.docBase);
+        if (v != 0 || docsWithField.get(doc - ctx.docBase)) {
+          hashTable.add(doc, v, 1);
         } else {
           ++missingCount;
         }
@@ -254,7 +255,7 @@ final class NumericFacets {
     if (zeros && (limit < 0 || result.size() < limit)) {
       // need to merge with the term dict
       if (!sf.indexed()) {
-        throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on a field which is not indexed");
+        throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on field " + sf.getName() + " which is not indexed");
       }
       // Add zeros until there are limit results
       final Set<String> alreadySeen = new HashSet<String>();
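(A note on the NumericFacets change above, with an illustrative sketch that is not part of the patch; the reader variable and the "price" field name are hypothetical.) Numeric doc values return 0 for documents that have no value, so a plain 0 is ambiguous; the facet loop therefore reads the value first and only consults the docsWithField bits when the value is 0:

    NumericDocValues longs = reader.getNumericDocValues("price");
    Bits docsWithField = reader.getDocsWithField("price");
    for (int doc = 0; doc < reader.maxDoc(); doc++) {
      long v = longs.get(doc);
      if (v != 0 || docsWithField.get(doc)) {
        // the document has a value (possibly a genuine 0)
      } else {
        // the document has no value for the field
      }
    }

Sorted fields need no bitset for this check: SortedDocValues.getOrd(doc) now returns -1 for a missing document, which is why accumSingle in DocValuesFacets.java above guards the global-ord mapping with term >= 0.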
diff --git a/solr/core/src/java/org/apache/solr/schema/SchemaField.java b/solr/core/src/java/org/apache/solr/schema/SchemaField.java
index 5a1fb5b82aa..ca6bd90d1b5 100644
--- a/solr/core/src/java/org/apache/solr/schema/SchemaField.java
+++ b/solr/core/src/java/org/apache/solr/schema/SchemaField.java
@@ -239,13 +239,21 @@ public final class SchemaField extends FieldProperties {
     if (on(falseProps,INDEXED)) {
       int pp = (INDEXED
-              | STORE_TERMVECTORS | STORE_TERMPOSITIONS | STORE_TERMOFFSETS
-              | SORT_MISSING_FIRST | SORT_MISSING_LAST);
+              | STORE_TERMVECTORS | STORE_TERMPOSITIONS | STORE_TERMOFFSETS);
       if (on(pp,trueProps)) {
         throw new RuntimeException("SchemaField: " + name + " conflicting 'true' field options for non-indexed field:" + props);
       }
       p &= ~pp;
     }
+
+    if (on(falseProps,INDEXED) && on(falseProps,DOC_VALUES)) {
+      int pp = (SORT_MISSING_FIRST | SORT_MISSING_LAST);
+      if (on(pp,trueProps)) {
+        throw new RuntimeException("SchemaField: " + name + " conflicting 'true' field options for non-indexed/non-docValues field:" + props);
+      }
+      p &= ~pp;
+    }
+
     if (on(falseProps,INDEXED)) {
       int pp = (OMIT_NORMS | OMIT_TF_POSITIONS | OMIT_POSITIONS);
       if (on(pp,falseProps)) {
diff --git a/solr/core/src/java/org/apache/solr/schema/StrField.java b/solr/core/src/java/org/apache/solr/schema/StrField.java
index 7c59741d7d8..2c9600c67de 100644
--- a/solr/core/src/java/org/apache/solr/schema/StrField.java
+++ b/solr/core/src/java/org/apache/solr/schema/StrField.java
@@ -80,9 +80,6 @@ public class StrField extends PrimitiveFieldType {
 
   @Override
   public void checkSchemaField(SchemaField field) {
-    if (field.hasDocValues() && !field.multiValued() && !(field.isRequired() || field.getDefaultValue() != null)) {
-      throw new IllegalStateException("Field " + this + " has single-valued doc values enabled, but has no default value and is not required");
-    }
   }
 }
diff --git a/solr/core/src/java/org/apache/solr/schema/TrieField.java b/solr/core/src/java/org/apache/solr/schema/TrieField.java
index 99cff2204b5..d0e92f7e427 100644
--- a/solr/core/src/java/org/apache/solr/schema/TrieField.java
+++ b/solr/core/src/java/org/apache/solr/schema/TrieField.java
@@ -696,9 +696,6 @@ public class TrieField extends PrimitiveFieldType {
 
   @Override
   public void checkSchemaField(final SchemaField field) {
-    if (field.hasDocValues() && !field.multiValued() && !(field.isRequired() || field.getDefaultValue() != null)) {
-      throw new IllegalStateException("Field " + this + " has single-valued doc values enabled, but has no default value and is not required");
-    }
   }
 }
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml b/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml
index e811f91e8a1..0e3116d0797 100755
--- a/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml
@@ -26,17 +26,16 @@
 [XML elided: the element markup in this hunk was stripped from this copy of the patch; only the "id" uniqueKey text survives]
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-docValuesMissing.xml b/solr/core/src/test-files/solr/collection1/conf/schema-docValuesMissing.xml
new file mode 100644
index 00000000000..05f73e15f29
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-docValuesMissing.xml
@@ -0,0 +1,61 @@
 [XML elided: the schema markup in this new file was stripped from this copy of the patch; only the "id" uniqueKey text survives. DocValuesMissingTest below exercises the fields it defines (floatdv, intdv, doubledv, longdv, datedv, stringdv, plus *_missingfirst/*_missinglast variants).]
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml b/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml
index 15074809892..73593829cf9 100644
--- a/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml
@@ -22,7 +22,7 @@
 [XML elided: one fieldType line changed; the markup was stripped from this copy of the patch]
diff --git a/solr/core/src/test/org/apache/solr/TestRandomDVFaceting.java b/solr/core/src/test/org/apache/solr/TestRandomDVFaceting.java
index df36b6b593b..b6581af49bf 100644
--- a/solr/core/src/test/org/apache/solr/TestRandomDVFaceting.java
+++ b/solr/core/src/test/org/apache/solr/TestRandomDVFaceting.java
@@ -39,7 +39,7 @@ import org.junit.Test;
  * to the indexed facet results as if it were just another faceting method.
  */
 @Slow
-@SuppressCodecs({"Lucene40", "Lucene41"})
+@SuppressCodecs({"Lucene40", "Lucene41", "Lucene42"})
 public class TestRandomDVFaceting extends SolrTestCaseJ4 {
 
   @BeforeClass
@@ -162,6 +162,8 @@ public class TestRandomDVFaceting extends SolrTestCaseJ4 {
 
     SchemaField sf = req.getSchema().getField(ftype.fname);
     boolean multiValued = sf.getType().multiValuedFieldCache();
+    boolean indexed = sf.indexed();
+    boolean numeric = sf.getType().getNumericType() != null;
 
     int offset = 0;
     if (rand.nextInt(100) < 20) {
@@ -179,8 +181,21 @@ public class TestRandomDVFaceting extends SolrTestCaseJ4 {
       params.add("facet.limit", Integer.toString(limit));
     }
 
-    if (rand.nextBoolean()) {
-      params.add("facet.sort", rand.nextBoolean() ? "index" : "count");
+    // the following two situations cannot work for unindexed single-valued numerics:
+    // (currently none of the dv fields in this test config)
+    // facet.sort = index
+    // facet.minCount = 0
+    if (!numeric || sf.multiValued()) {
+      if (rand.nextBoolean()) {
+        params.add("facet.sort", rand.nextBoolean() ? "index" : "count");
+      }
+
+      if (rand.nextInt(100) < 10) {
+        params.add("facet.mincount", Integer.toString(rand.nextInt(5)));
+      }
+    } else {
+      params.add("facet.sort", "count");
+      params.add("facet.mincount", Integer.toString(1+rand.nextInt(5)));
     }
 
     if ((ftype.vals instanceof SVal) && rand.nextInt(100) < 20) {
@@ -192,10 +207,6 @@ public class TestRandomDVFaceting extends SolrTestCaseJ4 {
       params.add("facet.prefix", prefix);
     }
 
-    if (rand.nextInt(100) < 10) {
-      params.add("facet.mincount", Integer.toString(rand.nextInt(5)));
-    }
-
     if (rand.nextInt(100) < 20) {
       params.add("facet.missing", "true");
     }
diff --git a/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java b/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java
index 0f06d44e2fa..c970652921b 100644
--- a/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java
+++ b/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java
@@ -55,10 +55,10 @@ public class TestCodecSupport extends SolrTestCaseJ4 {
     PerFieldDocValuesFormat format = (PerFieldDocValuesFormat) codec.docValuesFormat();
     assertEquals("Disk", format.getDocValuesFormatForField(schemaField.getName()).getName());
     schemaField = fields.get("string_memory_f");
-    assertEquals("Lucene42",
+    assertEquals("Lucene45",
         format.getDocValuesFormatForField(schemaField.getName()).getName());
     schemaField = fields.get("string_f");
-    assertEquals("Lucene42",
+    assertEquals("Lucene45",
         format.getDocValuesFormatForField(schemaField.getName()).getName());
   }
 
@@ -80,8 +80,8 @@ public class TestCodecSupport extends SolrTestCaseJ4 {
 
     assertEquals("Disk", format.getDocValuesFormatForField("foo_disk").getName());
     assertEquals("Disk", format.getDocValuesFormatForField("bar_disk").getName());
-    assertEquals("Lucene42", format.getDocValuesFormatForField("foo_memory").getName());
-    assertEquals("Lucene42", format.getDocValuesFormatForField("bar_memory").getName());
+    assertEquals("Lucene45", format.getDocValuesFormatForField("foo_memory").getName());
+    assertEquals("Lucene45", format.getDocValuesFormatForField("bar_memory").getName());
   }
 
   public void testUnknownField() {
diff --git a/solr/core/src/test/org/apache/solr/schema/BadIndexSchemaTest.java b/solr/core/src/test/org/apache/solr/schema/BadIndexSchemaTest.java
index 1a2693e56fd..a4453ddfbc3 100644
--- a/solr/core/src/test/org/apache/solr/schema/BadIndexSchemaTest.java
+++ b/solr/core/src/test/org/apache/solr/schema/BadIndexSchemaTest.java
@@ -93,10 +93,6 @@ public class BadIndexSchemaTest extends AbstractBadConfigTestBase {
     doTest("bad-schema-codec-global-vs-ft-mismatch.xml", "codec does not support");
   }
 
-  public void testDocValuesNotRequiredNoDefault() throws Exception {
-    doTest("bad-schema-docValues-not-required-no-default.xml", "has no default value and is not required");
-  }
-
   public void testDocValuesUnsupported() throws Exception {
     doTest("bad-schema-unsupported-docValues.xml", "does not support doc values");
   }
diff --git a/solr/core/src/test/org/apache/solr/schema/DocValuesMissingTest.java b/solr/core/src/test/org/apache/solr/schema/DocValuesMissingTest.java
new file mode 100644
index 00000000000..a0dc27bc70c
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/schema/DocValuesMissingTest.java
@@ -0,0 +1,479 @@
+package org.apache.solr.schema;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+/**
+ * Tests things like sorting on docvalues with missing values
+ */
+@SuppressCodecs({"Lucene40", "Lucene41", "Lucene42"}) // old formats cannot represent missing values
+public class DocValuesMissingTest extends SolrTestCaseJ4 {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig-basic.xml", "schema-docValuesMissing.xml");
+  }
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    clearIndex();
+    assertU(commit());
+  }
+
+  /** float with default lucene sort (treats as 0) */
+  public void testFloatSort() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "floatdv", "-1.3"));
+    assertU(adoc("id", "2", "floatdv", "4.2"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "floatdv asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=0]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "floatdv desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=0]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** float with sort missing always first */
+  public void testFloatSortMissingFirst() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "floatdv_missingfirst", "-1.3"));
+    assertU(adoc("id", "2", "floatdv_missingfirst", "4.2"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "floatdv_missingfirst asc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "floatdv_missingfirst desc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** float with sort missing always last */
+  public void testFloatSortMissingLast() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "floatdv_missinglast", "-1.3"));
+    assertU(adoc("id", "2", "floatdv_missinglast", "4.2"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "floatdv_missinglast asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+    assertQ(req("q", "*:*", "sort", "floatdv_missinglast desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+  }
+
+  /** float function query based on missing */
+  public void testFloatMissingFunction() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "floatdv", "-1.3"));
+    assertU(adoc("id", "2", "floatdv", "4.2"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "fl", "e:exists(floatdv)", "sort", "id asc"),
+        "//result/doc[1]/bool[@name='e'][.='false']",
+        "//result/doc[2]/bool[@name='e'][.='true']",
+        "//result/doc[3]/bool[@name='e'][.='true']");
+  }
+
+  /** float missing facet count */
+  public void testFloatMissingFacet() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1")); // missing
+    assertU(adoc("id", "2", "floatdv", "-1.3"));
+    assertU(adoc("id", "3", "floatdv", "4.2"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "facet", "true", "facet.field", "floatdv", "facet.mincount", "1", "facet.missing", "true"),
+        "//lst[@name='facet_fields']/lst[@name='floatdv']/int[@name='-1.3'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='floatdv']/int[@name='4.2'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='floatdv']/int[.=2]");
+  }
+
+  /** int with default lucene sort (treats as 0) */
+  public void testIntSort() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "intdv", "-1"));
+    assertU(adoc("id", "2", "intdv", "4"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "intdv asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=0]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "intdv desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=0]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** int with sort missing always first */
+  public void testIntSortMissingFirst() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "intdv_missingfirst", "-1"));
+    assertU(adoc("id", "2", "intdv_missingfirst", "4"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "intdv_missingfirst asc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "intdv_missingfirst desc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** int with sort missing always last */
+  public void testIntSortMissingLast() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "intdv_missinglast", "-1"));
+    assertU(adoc("id", "2", "intdv_missinglast", "4"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "intdv_missinglast asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+    assertQ(req("q", "*:*", "sort", "intdv_missinglast desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+  }
+
+  /** int function query based on missing */
+  public void testIntMissingFunction() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "intdv", "-1"));
+    assertU(adoc("id", "2", "intdv", "4"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "fl", "e:exists(intdv)", "sort", "id asc"),
+        "//result/doc[1]/bool[@name='e'][.='false']",
+        "//result/doc[2]/bool[@name='e'][.='true']",
+        "//result/doc[3]/bool[@name='e'][.='true']");
+  }
+
+  /** int missing facet count */
+  public void testIntMissingFacet() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1")); // missing
+    assertU(adoc("id", "2", "intdv", "-1"));
+    assertU(adoc("id", "3", "intdv", "4"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "facet", "true", "facet.field", "intdv", "facet.mincount", "1", "facet.missing", "true"),
+        "//lst[@name='facet_fields']/lst[@name='intdv']/int[@name='-1'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='intdv']/int[@name='4'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='intdv']/int[.=2]");
+  }
+
+  /** double with default lucene sort (treats as 0) */
+  public void testDoubleSort() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "doubledv", "-1.3"));
+    assertU(adoc("id", "2", "doubledv", "4.2"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "doubledv asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=0]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "doubledv desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=0]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** double with sort missing always first */
+  public void testDoubleSortMissingFirst() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "doubledv_missingfirst", "-1.3"));
+    assertU(adoc("id", "2", "doubledv_missingfirst", "4.2"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "doubledv_missingfirst asc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "doubledv_missingfirst desc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** double with sort missing always last */
+  public void testDoubleSortMissingLast() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "doubledv_missinglast", "-1.3"));
+    assertU(adoc("id", "2", "doubledv_missinglast", "4.2"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "doubledv_missinglast asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+    assertQ(req("q", "*:*", "sort", "doubledv_missinglast desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+  }
+
+  /** double function query based on missing */
+  public void testDoubleMissingFunction() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "doubledv", "-1.3"));
+    assertU(adoc("id", "2", "doubledv", "4.2"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "fl", "e:exists(doubledv)", "sort", "id asc"),
+        "//result/doc[1]/bool[@name='e'][.='false']",
+        "//result/doc[2]/bool[@name='e'][.='true']",
+        "//result/doc[3]/bool[@name='e'][.='true']");
+  }
+
+  /** double missing facet count */
+  public void testDoubleMissingFacet() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1")); // missing
+    assertU(adoc("id", "2", "doubledv", "-1.3"));
+    assertU(adoc("id", "3", "doubledv", "4.2"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "facet", "true", "facet.field", "doubledv", "facet.mincount", "1", "facet.missing", "true"),
+        "//lst[@name='facet_fields']/lst[@name='doubledv']/int[@name='-1.3'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='doubledv']/int[@name='4.2'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='doubledv']/int[.=2]");
+  }
+
+  /** long with default lucene sort (treats as 0) */
+  public void testLongSort() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "longdv", "-1"));
+    assertU(adoc("id", "2", "longdv", "4"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "longdv asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=0]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "longdv desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=0]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** long with sort missing always first */
+  public void testLongSortMissingFirst() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "longdv_missingfirst", "-1"));
+    assertU(adoc("id", "2", "longdv_missingfirst", "4"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "longdv_missingfirst asc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "longdv_missingfirst desc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** long with sort missing always last */
+  public void testLongSortMissingLast() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "longdv_missinglast", "-1"));
+    assertU(adoc("id", "2", "longdv_missinglast", "4"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "longdv_missinglast asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+    assertQ(req("q", "*:*", "sort", "longdv_missinglast desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+  }
+
+  /** long function query based on missing */
+  public void testLongMissingFunction() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "longdv", "-1"));
+    assertU(adoc("id", "2", "longdv", "4"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "fl", "e:exists(longdv)", "sort", "id asc"),
+        "//result/doc[1]/bool[@name='e'][.='false']",
+        "//result/doc[2]/bool[@name='e'][.='true']",
+        "//result/doc[3]/bool[@name='e'][.='true']");
+  }
+
+  /** long missing facet count */
+  public void testLongMissingFacet() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1")); // missing
+    assertU(adoc("id", "2", "longdv", "-1"));
+    assertU(adoc("id", "3", "longdv", "4"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "facet", "true", "facet.field", "longdv", "facet.mincount", "1", "facet.missing", "true"),
+        "//lst[@name='facet_fields']/lst[@name='longdv']/int[@name='-1'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='longdv']/int[@name='4'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='longdv']/int[.=2]");
+  }
+
+  /** date with default lucene sort (treats as 1970) */
+  public void testDateSort() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "datedv", "1900-12-31T23:59:59.999Z"));
+    assertU(adoc("id", "2", "datedv", "2005-12-31T23:59:59.999Z"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "datedv asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=0]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "datedv desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=0]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** date with sort missing always first */
+  public void testDateSortMissingFirst() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "datedv_missingfirst", "1900-12-31T23:59:59.999Z"));
+    assertU(adoc("id", "2", "datedv_missingfirst", "2005-12-31T23:59:59.999Z"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "datedv_missingfirst asc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "datedv_missingfirst desc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** date with sort missing always last */
+  public void testDateSortMissingLast() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "datedv_missinglast", "1900-12-31T23:59:59.999Z"));
+    assertU(adoc("id", "2", "datedv_missinglast", "2005-12-31T23:59:59.999Z"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "datedv_missinglast asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+    assertQ(req("q", "*:*", "sort", "datedv_missinglast desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+  }
+
+  /** date function query based on missing */
+  public void testDateMissingFunction() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "datedv", "1900-12-31T23:59:59.999Z"));
+    assertU(adoc("id", "2", "datedv", "2005-12-31T23:59:59.999Z"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "fl", "e:exists(datedv)", "sort", "id asc"),
+        "//result/doc[1]/bool[@name='e'][.='false']",
+        "//result/doc[2]/bool[@name='e'][.='true']",
+        "//result/doc[3]/bool[@name='e'][.='true']");
+  }
+
+  /** date missing facet count */
+  public void testDateMissingFacet() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1")); // missing
+    assertU(adoc("id", "2", "datedv", "1900-12-31T23:59:59.999Z"));
+    assertU(adoc("id", "3", "datedv", "2005-12-31T23:59:59.999Z"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "facet", "true", "facet.field", "datedv", "facet.mincount", "1", "facet.missing", "true"),
+        "//lst[@name='facet_fields']/lst[@name='datedv']/int[@name='1900-12-31T23:59:59.999Z'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='datedv']/int[@name='2005-12-31T23:59:59.999Z'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='datedv']/int[.=2]");
+  }
+
+  /** string with default lucene sort (treats as "") */
+  public void testStringSort() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "stringdv", "a"));
+    assertU(adoc("id", "2", "stringdv", "z"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "stringdv asc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "stringdv desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+  }
+
+  /** string with sort missing always first */
+  public void testStringSortMissingFirst() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "stringdv_missingfirst", "a"));
+    assertU(adoc("id", "2", "stringdv_missingfirst", "z"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "stringdv_missingfirst asc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=2]");
+    assertQ(req("q", "*:*", "sort", "stringdv_missingfirst desc"),
+        "//result/doc[1]/str[@name='id'][.=0]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=1]");
+  }
+
+  /** string with sort missing always last */
+  public void testStringSortMissingLast() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "stringdv_missinglast", "a"));
+    assertU(adoc("id", "2", "stringdv_missinglast", "z"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "sort", "stringdv_missinglast asc"),
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+    assertQ(req("q", "*:*", "sort", "stringdv_missinglast desc"),
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=1]",
+        "//result/doc[3]/str[@name='id'][.=0]");
+  }
+
+  /** string function query based on missing */
+  public void testStringMissingFunction() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1", "stringdv", "a"));
+    assertU(adoc("id", "2", "stringdv", "z"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "fl", "e:exists(stringdv)", "sort", "id asc"),
+        "//result/doc[1]/bool[@name='e'][.='false']",
+        "//result/doc[2]/bool[@name='e'][.='true']",
+        "//result/doc[3]/bool[@name='e'][.='true']");
+  }
+
+  /** string missing facet count */
+  public void testStringMissingFacet() throws Exception {
+    assertU(adoc("id", "0")); // missing
+    assertU(adoc("id", "1")); // missing
+    assertU(adoc("id", "2", "stringdv", "a"));
+    assertU(adoc("id", "3", "stringdv", "z"));
+    assertU(commit());
+    assertQ(req("q", "*:*", "facet", "true", "facet.field", "stringdv", "facet.mincount", "1", "facet.missing", "true"),
+        "//lst[@name='facet_fields']/lst[@name='stringdv']/int[@name='a'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='stringdv']/int[@name='z'][.=1]",
+        "//lst[@name='facet_fields']/lst[@name='stringdv']/int[.=2]");
+  }
+}
diff --git a/solr/core/src/test/org/apache/solr/search/TestDocSet.java b/solr/core/src/test/org/apache/solr/search/TestDocSet.java
index faebe6c261b..c2a282066e5 100644
--- a/solr/core/src/test/org/apache/solr/search/TestDocSet.java
+++ b/solr/core/src/test/org/apache/solr/search/TestDocSet.java
@@ -403,6 +403,11 @@ public class TestDocSet extends LuceneTestCase {
       return null;
     }
 
+    @Override
+    public Bits getDocsWithField(String field) throws IOException {
+      return null;
+    }
+
     @Override
     public NumericDocValues getNormValues(String field) {
       return null;
diff --git a/solr/example/solr/collection1/conf/schema.xml b/solr/example/solr/collection1/conf/schema.xml
index 9cdd2976026..75fad489f25 100755
--- a/solr/example/solr/collection1/conf/schema.xml
+++ b/solr/example/solr/collection1/conf/schema.xml
@@ -168,8 +168,10 @@