From 1c982a5f566dbd15cd1ff51652138e6924fa9d9e Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 16 Aug 2013 10:24:36 +0000 Subject: [PATCH 01/16] create branch git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1514641 13f79535-47bb-0310-9956-ffa450edef68 From 3be8ed1d108857904a4af0fe75ca86c4de50368a Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 16 Aug 2013 10:25:38 +0000 Subject: [PATCH 02/16] LUCENE-5178: add 'missing' support to docvalues (simpletext only) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1514642 13f79535-47bb-0310-9956-ffa450edef68 --- .../codecs/diskdv/DiskDocValuesConsumer.java | 52 ++++- .../codecs/diskdv/DiskDocValuesProducer.java | 14 ++ .../simpletext/SimpleTextDocValuesFormat.java | 11 +- .../simpletext/SimpleTextDocValuesReader.java | 99 ++++++++- .../simpletext/SimpleTextDocValuesWriter.java | 32 ++- .../lucene/codecs/DocValuesConsumer.java | 47 ++-- .../lucene/codecs/DocValuesProducer.java | 62 ++++++ .../lucene/codecs/MissingOrdRemapper.java | 124 +++++++++++ .../lucene40/Lucene40DocValuesReader.java | 6 + .../lucene42/Lucene42DocValuesConsumer.java | 53 ++++- .../lucene42/Lucene42DocValuesProducer.java | 9 + .../perfield/PerFieldDocValuesFormat.java | 7 + .../org/apache/lucene/index/AtomicReader.java | 6 + .../lucene/index/BinaryDocValuesWriter.java | 34 ++- .../org/apache/lucene/index/CheckIndex.java | 79 +++++-- .../lucene/index/DocValuesProcessor.java | 2 +- .../lucene/index/FilterAtomicReader.java | 6 + .../apache/lucene/index/MultiDocValues.java | 46 ++++ .../lucene/index/NormsConsumerPerField.java | 2 +- .../lucene/index/NumericDocValuesWriter.java | 32 ++- .../lucene/index/ParallelAtomicReader.java | 7 + .../lucene/index/SegmentCoreReaders.java | 36 ++- .../apache/lucene/index/SegmentMerger.java | 17 +- .../apache/lucene/index/SegmentReader.java | 6 + .../index/SlowCompositeReaderWrapper.java | 6 + .../apache/lucene/index/SortedDocValues.java | 7 +- .../lucene/index/SortedDocValuesWriter.java | 18 +- .../org/apache/lucene/search/FieldCache.java | 25 +-- .../apache/lucene/search/FieldCacheImpl.java | 7 +- .../apache/lucene/search/TestFieldCache.java | 2 +- .../facet42/Facet42DocValuesProducer.java | 8 + .../util/FacetsPayloadMigrationReader.java | 13 ++ .../highlight/WeightedSpanTermExtractor.java | 5 + .../lucene/index/memory/MemoryIndex.java | 5 + .../index/sorter/SortingAtomicReader.java | 46 ++-- .../asserting/AssertingDocValuesFormat.java | 38 ++-- .../CheapBastardDocValuesProducer.java | 13 ++ .../lucene40/Lucene40DocValuesWriter.java | 62 ++++-- .../index/BaseDocValuesFormatTestCase.java | 207 +++++++++++++++++- .../lucene/index/FieldFilterAtomicReader.java | 6 + .../apache/lucene/util/LuceneTestCase.java | 7 + .../org/apache/lucene/util/_TestUtil.java | 10 + .../apache/solr/request/DocValuesFacets.java | 1 + .../org/apache/solr/search/TestDocSet.java | 5 + 44 files changed, 1107 insertions(+), 173 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/MissingOrdRemapper.java diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java index 2d4853a66b0..b5124871237 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java @@ -23,6 +23,7 @@ import java.util.HashSet; import org.apache.lucene.codecs.CodecUtil; import 
org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.MissingOrdRemapper; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; @@ -92,8 +93,9 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { if (optimizeStorage) { uniqueValues = new HashSet<>(); + // nocommit: impl null values (ideally smartly) for (Number nv : values) { - final long v = nv.longValue(); + final long v = nv == null ? 0 : nv.longValue(); if (gcd != 1) { if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { @@ -151,14 +153,15 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { meta.writeLong(gcd); final BlockPackedWriter quotientWriter = new BlockPackedWriter(data, BLOCK_SIZE); for (Number nv : values) { - quotientWriter.add((nv.longValue() - minValue) / gcd); + long value = nv == null ? 0 : nv.longValue(); + quotientWriter.add((value - minValue) / gcd); } quotientWriter.finish(); break; case DELTA_COMPRESSED: final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); for (Number nv : values) { - writer.add(nv.longValue()); + writer.add(nv == null ? 0 : nv.longValue()); } writer.finish(); break; @@ -173,7 +176,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { final int bitsRequired = PackedInts.bitsRequired(uniqueValues.size() - 1); final PackedInts.Writer ordsWriter = PackedInts.getWriterNoHeader(data, PackedInts.Format.PACKED, (int) count, bitsRequired, PackedInts.DEFAULT_BUFFER_SIZE); for (Number nv : values) { - ordsWriter.add(encode.get(nv.longValue())); + ordsWriter.add(encode.get(nv == null ? 0 : nv.longValue())); } ordsWriter.finish(); break; @@ -192,9 +195,12 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { final long startFP = data.getFilePointer(); long count = 0; for(BytesRef v : values) { - minLength = Math.min(minLength, v.length); - maxLength = Math.max(maxLength, v.length); - data.writeBytes(v.bytes, v.offset, v.length); + final int length = v == null ? 0 : v.length; + minLength = Math.min(minLength, length); + maxLength = Math.max(maxLength, length); + if (v != null) { + data.writeBytes(v.bytes, v.offset, v.length); + } count++; } meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED); @@ -213,7 +219,9 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE); long addr = 0; for (BytesRef v : values) { - addr += v.length; + if (v != null) { + addr += v.length; + } writer.add(addr); } writer.finish(); @@ -278,6 +286,34 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { @Override public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + // nocommit: remove this hack and support missing! + + // three cases for simulating the old writer: + // 1. no missing + // 2. missing (and empty string in use): remap ord=-1 -> ord=0 + // 3. 
missing (and empty string not in use): remap all ords +1, insert empty string into values + boolean anyMissing = false; + for (Number n : docToOrd) { + if (n.longValue() == -1) { + anyMissing = true; + break; + } + } + + boolean hasEmptyString = false; + for (BytesRef b : values) { + hasEmptyString = b.length == 0; + break; + } + + if (!anyMissing) { + // nothing to do + } else if (hasEmptyString) { + docToOrd = MissingOrdRemapper.mapMissingToOrd0(docToOrd); + } else { + docToOrd = MissingOrdRemapper.mapAllOrds(docToOrd); + values = MissingOrdRemapper.insertEmptyValue(values); + } meta.writeVInt(field.number); meta.writeByte(DiskDocValuesFormat.SORTED); addTermsDict(field, values); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java index c100b84142c..11a60fdaf6c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java @@ -32,6 +32,7 @@ import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.DocValuesProducer.SortedSetDocsWithField; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DocsAndPositionsEnum; @@ -59,6 +60,7 @@ class DiskDocValuesProducer extends DocValuesProducer { private final Map ords; private final Map ordIndexes; private final IndexInput data; + private final int maxDoc; // memory-resident structures private final Map addressInstances = new HashMap(); @@ -68,6 +70,7 @@ class DiskDocValuesProducer extends DocValuesProducer { String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); // read in the entries from the metadata file. IndexInput in = state.directory.openInput(metaName, state.context); + this.maxDoc = state.segmentInfo.getDocCount(); boolean success = false; final int version; try { @@ -490,6 +493,17 @@ class DiskDocValuesProducer extends DocValuesProducer { }; } + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + // nocommit: only use this if the field's entry has missing values (write that), + // otherwise return MatchAllBits + if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) { + return new SortedSetDocsWithField(getSortedSet(field), maxDoc); + } else { + return new Bits.MatchAllBits(maxDoc); + } + } + @Override public void close() throws IOException { data.close(); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java index 02557c95b57..c256367f23f 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java @@ -38,12 +38,16 @@ import org.apache.lucene.index.SegmentWriteState; * minvalue 0 * pattern 000 * 005 + * T * 234 + * T * 123 + * T * ... * * so a document's value (delta encoded from minvalue) can be retrieved by - * seeking to startOffset + (1+pattern.length())*docid. The extra 1 is the newline. + * seeking to startOffset + (1+pattern.length()+2)*docid. The extra 1 is the newline. 
+ * The extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
  *
  * for bytes this is also a "fixed-width" file, for example:
  * <pre>
@@ -53,12 +57,15 @@ import org.apache.lucene.index.SegmentWriteState;
  *    pattern 0
  *  length 6
  *  foobar[space][space]
+ *  T
  *  length 3
  *  baz[space][space][space][space][space]
+ *  T
  *  ...
  *  
- * so a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*doc
+ * so a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength+2)*doc
  * the extra 9 is 2 newlines, plus "length " itself.
+ * the extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
  *
  * for sorted bytes this is a fixed-width file, for example:
  * <pre>
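// Worked example of the seek math above (an illustrative sketch, not part of
// the patch; patternLen/maxLength stand in for the values read from the field header):
final class SimpleTextAddressing {
  // numeric: value line (patternLen chars + '\n') plus the 'T'/'F' line (1 char + '\n')
  static long numericAddr(long startOffset, int patternLen, int docId) {
    return startOffset + (1L + patternLen + 2) * docId;
  }
  // binary: "length " (7 chars) + patternLen digits + '\n', then maxLength padded
  // bytes + '\n', then 'T'/'F' + '\n' -- hence 9 + patternLen + maxLength + 2 per doc
  static long binaryAddr(long startOffset, int patternLen, int maxLength, int docId) {
    return startOffset + (9L + patternLen + maxLength + 2) * docId;
  }
}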
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
index db5ec4e09f3..c625f4ea2e8 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
@@ -28,6 +28,7 @@ import java.util.Locale;
 import java.util.Map;
 
 import org.apache.lucene.codecs.DocValuesProducer;
+import org.apache.lucene.codecs.DocValuesProducer.SortedSetDocsWithField;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.FieldInfo;
@@ -38,6 +39,7 @@ import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.FieldInfo.DocValuesType;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.StringHelper;
 
@@ -100,7 +102,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
         assert startsWith(PATTERN);
         field.pattern = stripPrefix(PATTERN);
         field.dataStartFilePointer = data.getFilePointer();
-        data.seek(data.getFilePointer() + (1+field.pattern.length()) * maxDoc);
+        data.seek(data.getFilePointer() + (1+field.pattern.length()+2) * maxDoc);
       } else if (dvType == DocValuesType.BINARY) {
         readLine();
         assert startsWith(MAXLENGTH);
@@ -109,7 +111,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
         assert startsWith(PATTERN);
         field.pattern = stripPrefix(PATTERN);
         field.dataStartFilePointer = data.getFilePointer();
-        data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength) * maxDoc);
+        data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength+2) * maxDoc);
       } else if (dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) {
         readLine();
         assert startsWith(NUMVALUES);
@@ -158,7 +160,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
           if (docID < 0 || docID >= maxDoc) {
             throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
           }
-          in.seek(field.dataStartFilePointer + (1+field.pattern.length())*docID);
+          in.seek(field.dataStartFilePointer + (1+field.pattern.length()+2)*docID);
           SimpleTextUtil.readLine(in, scratch);
           //System.out.println("parsing delta: " + scratch.utf8ToString());
           BigDecimal bd;
@@ -169,6 +171,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
             e.initCause(pe);
             throw e;
           }
+          SimpleTextUtil.readLine(in, scratch); // read the line telling us if it's real or not
           return BigInteger.valueOf(field.minValue).add(bd.toBigIntegerExact()).longValue();
         } catch (IOException ioe) {
           throw new RuntimeException(ioe);
@@ -176,6 +179,30 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
       }
     };
   }
+  
+  private Bits getNumericDocsWithField(FieldInfo fieldInfo) throws IOException {
+    final OneField field = fields.get(fieldInfo.name);
+    final IndexInput in = data.clone();
+    final BytesRef scratch = new BytesRef();
+    return new Bits() {
+      @Override
+      public boolean get(int index) {
+        try {
+          in.seek(field.dataStartFilePointer + (1+field.pattern.length()+2)*index);
+          SimpleTextUtil.readLine(in, scratch); // data
+          SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
+          return scratch.bytes[scratch.offset] == (byte) 'T';
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+
+      @Override
+      public int length() {
+        return maxDoc;
+      }
+    };
+  }
 
   @Override
   public BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOException {
@@ -196,7 +223,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
           if (docID < 0 || docID >= maxDoc) {
             throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
           }
-          in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength)*docID);
+          in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength+2)*docID);
           SimpleTextUtil.readLine(in, scratch);
           assert StringHelper.startsWith(scratch, LENGTH);
           int len;
@@ -217,6 +244,45 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
       }
     };
   }
+  
+  private Bits getBinaryDocsWithField(FieldInfo fieldInfo) throws IOException {
+    final OneField field = fields.get(fieldInfo.name);
+    final IndexInput in = data.clone();
+    final BytesRef scratch = new BytesRef();
+    final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
+
+    return new Bits() {
+      @Override
+      public boolean get(int index) {
+        try {
+          in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength+2)*index);
+          SimpleTextUtil.readLine(in, scratch);
+          assert StringHelper.startsWith(scratch, LENGTH);
+          int len;
+          try {
+            len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
+          } catch (ParseException pe) {
+            CorruptIndexException e = new CorruptIndexException("failed to parse int length (resource=" + in + ")");
+            e.initCause(pe);
+            throw e;
+          }
+          // skip past the value's raw bytes (they may themselves contain
+          // newlines, so we cannot simply readLine over them)
+          byte bytes[] = new byte[len];
+          in.readBytes(bytes, 0, len);
+          SimpleTextUtil.readLine(in, scratch); // padding + newline
+          SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
+          return scratch.bytes[scratch.offset] == (byte) 'T';
+        } catch (IOException ioe) {
+          throw new RuntimeException(ioe);
+        }
+      }
+
+      @Override
+      public int length() {
+        return maxDoc;
+      }
+    };
+  }
 
   @Override
   public SortedDocValues getSorted(FieldInfo fieldInfo) throws IOException {
@@ -241,7 +307,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
           in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + docID * (1 + field.ordPattern.length()));
           SimpleTextUtil.readLine(in, scratch);
           try {
-            return ordDecoder.parse(scratch.utf8ToString()).intValue();
+            return (int) ordDecoder.parse(scratch.utf8ToString()).longValue()-1;
           } catch (ParseException pe) {
             CorruptIndexException e = new CorruptIndexException("failed to parse ord (resource=" + in + ")");
             e.initCause(pe);
@@ -255,8 +321,12 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
       @Override
       public void lookupOrd(int ord, BytesRef result) {
         try {
-          if (ord < 0 || ord >= field.numValues) {
-            throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord);
+          if (ord == -1) {
+            result.length = 0;
+            return;
+          }
+          if (ord < -1 || ord >= field.numValues) {
+            throw new IndexOutOfBoundsException("ord must be -1 .. " + (field.numValues-1) + "; got " + ord);
           }
           in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
           SimpleTextUtil.readLine(in, scratch);
@@ -362,6 +432,21 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
       }
     };
   }
+  
+  @Override
+  public Bits getDocsWithField(FieldInfo field) throws IOException {
+    if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) {
+      return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
+    } else if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED) {
+      return new SortedDocsWithField(getSorted(field), maxDoc);
+    } else if (field.getDocValuesType() == FieldInfo.DocValuesType.BINARY) {
+      return getBinaryDocsWithField(field);
+    } else if (field.getDocValuesType() == FieldInfo.DocValuesType.NUMERIC) {
+      return getNumericDocsWithField(field);
+    } else {
+      return new Bits.MatchAllBits(maxDoc);
+    }
+  }
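// Hypothetical usage sketch (names assumed, not code from the patch): pairing
// the returned Bits with the per-type getter lets a caller tell the 0/empty
// filler apart from a truly absent value.
void scan(DocValuesProducer producer, FieldInfo fieldInfo, int maxDoc) throws IOException {
  Bits docsWithField = producer.getDocsWithField(fieldInfo);
  NumericDocValues values = producer.getNumeric(fieldInfo);
  for (int doc = 0; doc < maxDoc; doc++) {
    if (docsWithField.get(doc)) {
      long v = values.get(doc);  // a real value for this document
    } else {
      // missing: values.get(doc) would return the 0 written as filler
    }
  }
}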
 
   @Override
   public void close() throws IOException {
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
index 2f86255cbd6..e5f1e35007e 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
@@ -78,7 +78,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
     long minValue = Long.MAX_VALUE;
     long maxValue = Long.MIN_VALUE;
     for(Number n : values) {
-      long v = n.longValue();
+      long v = n == null ? 0 : n.longValue();
       minValue = Math.min(minValue, v);
       maxValue = Math.max(maxValue, v);
     }
@@ -112,13 +112,19 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
 
     // second pass to write the values
     for(Number n : values) {
-      long value = n.longValue();
+      long value = n == null ? 0 : n.longValue();
       assert value >= minValue;
       Number delta = BigInteger.valueOf(value).subtract(BigInteger.valueOf(minValue));
       String s = encoder.format(delta);
       assert s.length() == patternString.length();
       SimpleTextUtil.write(data, s, scratch);
       SimpleTextUtil.writeNewline(data);
+      if (n == null) {
+        SimpleTextUtil.write(data, "F", scratch);
+      } else {
+        SimpleTextUtil.write(data, "T", scratch);
+      }
+      SimpleTextUtil.writeNewline(data);
       numDocsWritten++;
       assert numDocsWritten <= numDocs;
     }
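// For concreteness (illustrative, not captured patch output): with pattern "000"
// and minValue 0, a doc with value 5 followed by a missing doc is laid out as:
//   005
//   T
//   000
//   F
// i.e. a missing doc still occupies a full record, so the fixed-width seek math holds.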
@@ -132,7 +138,8 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
     assert field.getDocValuesType() == DocValuesType.BINARY;
     int maxLength = 0;
     for(BytesRef value : values) {
-      maxLength = Math.max(maxLength, value.length);
+      final int length = value == null ? 0 : value.length;
+      maxLength = Math.max(maxLength, length);
     }
     writeFieldEntry(field, FieldInfo.DocValuesType.BINARY);
 
@@ -155,19 +162,28 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
     int numDocsWritten = 0;
     for(BytesRef value : values) {
       // write length
+      final int length = value == null ? 0 : value.length;
       SimpleTextUtil.write(data, LENGTH);
-      SimpleTextUtil.write(data, encoder.format(value.length), scratch);
+      SimpleTextUtil.write(data, encoder.format(length), scratch);
       SimpleTextUtil.writeNewline(data);
         
       // write bytes -- don't use SimpleText.write
       // because it escapes:
-      data.writeBytes(value.bytes, value.offset, value.length);
+      if (value != null) {
+        data.writeBytes(value.bytes, value.offset, value.length);
+      }
 
       // pad to fit
-      for (int i = value.length; i < maxLength; i++) {
+      for (int i = length; i < maxLength; i++) {
         data.writeByte((byte)' ');
       }
       SimpleTextUtil.writeNewline(data);
+      if (value == null) {
+        SimpleTextUtil.write(data, "F", scratch);
+      } else {
+        SimpleTextUtil.write(data, "T", scratch);
+      }
+      SimpleTextUtil.writeNewline(data);
       numDocsWritten++;
     }
 
@@ -209,7 +225,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
     SimpleTextUtil.writeNewline(data);
     final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
     
-    int maxOrdBytes = Integer.toString(valueCount).length();
+    int maxOrdBytes = Long.toString(valueCount+1L).length();
     sb.setLength(0);
     for (int i = 0; i < maxOrdBytes; i++) {
       sb.append('0');
@@ -246,7 +262,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
     assert valuesSeen == valueCount;
 
     for(Number ord : docToOrd) {
-      SimpleTextUtil.write(data, ordEncoder.format(ord.intValue()), scratch);
+      SimpleTextUtil.write(data, ordEncoder.format(ord.longValue()+1), scratch); // shift so that missing (-1) encodes as 0
       SimpleTextUtil.writeNewline(data);
     }
   }
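// The +1 above is the writer half of the ord shift; SimpleTextDocValuesReader's
// getSorted() undoes it with -1. Illustrative round trip (variable names assumed):
int ord = sorted.getOrd(doc);        // -1 means this doc had no value
if (ord == -1) {
  // lookupOrd(-1, scratch) yields the empty BytesRef (see the reader above)
} else {
  sorted.lookupOrd(ord, scratch);    // a real term
}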
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
index 921b94dff69..e0aac6d5cbe 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java
@@ -69,7 +69,8 @@ public abstract class DocValuesConsumer implements Closeable {
   /**
    * Writes numeric docvalues for a field.
    * @param field field information
-   * @param values Iterable of numeric values (one for each document).
+   * @param values Iterable of numeric values (one for each document). {@code null} indicates
+   *               a missing value.
    * @throws IOException if an I/O error occurred.
    */
   public abstract void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException;
@@ -77,7 +78,8 @@ public abstract class DocValuesConsumer implements Closeable {
   /**
    * Writes binary docvalues for a field.
    * @param field field information
-   * @param values Iterable of binary values (one for each document).
+   * @param values Iterable of binary values (one for each document). {@code null} indicates
+   *               a missing value.
    * @throws IOException if an I/O error occurred.
    */
   public abstract void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException;
@@ -86,7 +88,8 @@ public abstract class DocValuesConsumer implements Closeable {
    * Writes pre-sorted binary docvalues for a field.
    * @param field field information
    * @param values Iterable of binary values in sorted order (deduplicated).
-   * @param docToOrd Iterable of ordinals (one for each document).
+   * @param docToOrd Iterable of ordinals (one for each document). {@code -1} indicates
+   *                 a missing value.
    * @throws IOException if an I/O error occurred.
    */
   public abstract void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException;
@@ -95,7 +98,8 @@ public abstract class DocValuesConsumer implements Closeable {
    * Writes pre-sorted set docvalues for a field
    * @param field field information
    * @param values Iterable of binary values in sorted order (deduplicated).
-   * @param docToOrdCount Iterable of the number of values for each document. 
+   * @param docToOrdCount Iterable of the number of values for each document. A zero ordinal
+   *                      count indicates a missing value.
    * @param ords Iterable of ordinal occurrences (docToOrdCount*maxDoc total).
    * @throws IOException if an I/O error occurred.
    */
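// Taken together, the javadoc notes above fix one missing-value convention per
// docvalues type; a consumer implementation decodes them roughly like this
// (sketch only, helper names are illustrative):
static boolean missingNumeric(Number n)        { return n == null; }
static boolean missingBinary(BytesRef b)       { return b == null; }
static boolean missingSorted(long ord)         { return ord == -1; }
static boolean missingSortedSet(int ordCount)  { return ordCount == 0; }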
@@ -107,7 +111,7 @@ public abstract class DocValuesConsumer implements Closeable {
    * The default implementation calls {@link #addNumericField}, passing
    * an Iterable that merges and filters deleted documents on the fly.
    */
-  public void mergeNumericField(FieldInfo fieldInfo, final MergeState mergeState, final List<NumericDocValues> toMerge) throws IOException {
+  public void mergeNumericField(final FieldInfo fieldInfo, final MergeState mergeState, final List<NumericDocValues> toMerge, final List<Bits> docsWithField) throws IOException {
 
     addNumericField(fieldInfo,
                     new Iterable<Number>() {
@@ -116,10 +120,11 @@ public abstract class DocValuesConsumer implements Closeable {
                         return new Iterator<Number>() {
                           int readerUpto = -1;
                           int docIDUpto;
-                          long nextValue;
+                          Long nextValue;
                           AtomicReader currentReader;
                           NumericDocValues currentValues;
                           Bits currentLiveDocs;
+                          Bits currentDocsWithField;
                           boolean nextIsSet;
 
                           @Override
@@ -139,7 +144,6 @@ public abstract class DocValuesConsumer implements Closeable {
                             }
                             assert nextIsSet;
                             nextIsSet = false;
-                            // TODO: make a mutable number
                             return nextValue;
                           }
 
@@ -155,6 +159,7 @@ public abstract class DocValuesConsumer implements Closeable {
                                   currentReader = mergeState.readers.get(readerUpto);
                                   currentValues = toMerge.get(readerUpto);
                                   currentLiveDocs = currentReader.getLiveDocs();
+                                  currentDocsWithField = docsWithField.get(readerUpto);
                                 }
                                 docIDUpto = 0;
                                 continue;
@@ -162,7 +167,11 @@ public abstract class DocValuesConsumer implements Closeable {
 
                               if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
                                 nextIsSet = true;
-                                nextValue = currentValues.get(docIDUpto);
+                                if (currentDocsWithField.get(docIDUpto)) {
+                                  nextValue = currentValues.get(docIDUpto);
+                                } else {
+                                  nextValue = null;
+                                }
                                 docIDUpto++;
                                 return true;
                               }
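// Note the switch from long to Long above: the boxed type is what lets a real
// null flow through to addNumericField for missing docs, e.g. (illustrative):
//   Long nextValue = currentDocsWithField.get(docIDUpto)
//       ? Long.valueOf(currentValues.get(docIDUpto))
//       : null;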
@@ -181,7 +190,7 @@ public abstract class DocValuesConsumer implements Closeable {
    * The default implementation calls {@link #addBinaryField}, passing
    * an Iterable that merges and filters deleted documents on the fly.
    */
-  public void mergeBinaryField(FieldInfo fieldInfo, final MergeState mergeState, final List<BinaryDocValues> toMerge) throws IOException {
+  public void mergeBinaryField(FieldInfo fieldInfo, final MergeState mergeState, final List<BinaryDocValues> toMerge, final List<Bits> docsWithField) throws IOException {
 
     addBinaryField(fieldInfo,
                    new Iterable<BytesRef>() {
@@ -191,9 +200,11 @@ public abstract class DocValuesConsumer implements Closeable {
                          int readerUpto = -1;
                          int docIDUpto;
                          BytesRef nextValue = new BytesRef();
+                         BytesRef nextPointer; // null if missing, otherwise points to nextValue
                          AtomicReader currentReader;
                          BinaryDocValues currentValues;
                          Bits currentLiveDocs;
+                         Bits currentDocsWithField;
                          boolean nextIsSet;
 
                          @Override
@@ -213,8 +224,7 @@ public abstract class DocValuesConsumer implements Closeable {
                            }
                            assert nextIsSet;
                            nextIsSet = false;
-                           // TODO: make a mutable number
-                           return nextValue;
+                           return nextPointer;
                          }
 
                          private boolean setNext() {
@@ -228,6 +238,7 @@ public abstract class DocValuesConsumer implements Closeable {
                                if (readerUpto < toMerge.size()) {
                                  currentReader = mergeState.readers.get(readerUpto);
                                  currentValues = toMerge.get(readerUpto);
+                                 currentDocsWithField = docsWithField.get(readerUpto);
                                  currentLiveDocs = currentReader.getLiveDocs();
                                }
                                docIDUpto = 0;
@@ -236,7 +247,12 @@ public abstract class DocValuesConsumer implements Closeable {
 
                              if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
                                nextIsSet = true;
-                               currentValues.get(docIDUpto, nextValue);
+                               if (currentDocsWithField.get(docIDUpto)) {
+                                 currentValues.get(docIDUpto, nextValue);
+                                 nextPointer = nextValue;
+                               } else {
+                                 nextPointer = null;
+                               }
                                docIDUpto++;
                                return true;
                              }
@@ -272,7 +288,10 @@ public abstract class DocValuesConsumer implements Closeable {
         OpenBitSet bitset = new OpenBitSet(dv.getValueCount());
         for (int i = 0; i < reader.maxDoc(); i++) {
           if (liveDocs.get(i)) {
-            bitset.set(dv.getOrd(i));
+            int ord = dv.getOrd(i);
+            if (ord >= 0) {
+              bitset.set(ord);
+            }
           }
         }
         liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
@@ -368,7 +387,7 @@ public abstract class DocValuesConsumer implements Closeable {
                   if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
                     nextIsSet = true;
                     int segOrd = dvs[readerUpto].getOrd(docIDUpto);
-                    nextValue = (int) map.getGlobalOrd(readerUpto, segOrd);
+                    nextValue = segOrd == -1 ? -1 : (int) map.getGlobalOrd(readerUpto, segOrd);
                     docIDUpto++;
                     return true;
                   }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
index b2c5d549d27..04778aa1201 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java
@@ -25,6 +25,7 @@ import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.util.Bits;
 
 /** Abstract API that produces numeric, binary and
  * sorted docvalues.
@@ -56,4 +57,65 @@ public abstract class DocValuesProducer implements Closeable {
    *  The returned instance need not be thread-safe: it will only be
    *  used by a single thread. */
   public abstract SortedSetDocValues getSortedSet(FieldInfo field) throws IOException;
+  
+  /** Returns a {@link Bits} of size reader.maxDoc(), with a bit set for each
+   *  docid that has a value for this field.
+   *  The returned instance need not be thread-safe: it will only be
+   *  used by a single thread. */
+  public abstract Bits getDocsWithField(FieldInfo field) throws IOException;
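// A format that can never record a missing value can satisfy this contract
// trivially, as the Lucene40 reader later in this patch does (maxDoc assumed
// to be in scope):
@Override
public Bits getDocsWithField(FieldInfo field) throws IOException {
  return new Bits.MatchAllBits(maxDoc);
}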
+  
+  /** 
+   * A simple implementation of {@link DocValuesProducer#getDocsWithField} that 
+   * returns {@code true} if a document has an ordinal >= 0
+   * 

+   * Codecs can choose to use this (or implement it more efficiently another way), but
+   * in most cases a Bits is unnecessary anyway: users can check this as they go.
+   */
+  public static class SortedDocsWithField implements Bits {
+    final SortedDocValues in;
+    final int maxDoc;
+    
+    public SortedDocsWithField(SortedDocValues in, int maxDoc) {
+      this.in = in;
+      this.maxDoc = maxDoc;
+    }
+    
+    @Override
+    public boolean get(int index) {
+      return in.getOrd(index) >= 0;
+    }
+    
+    @Override
+    public int length() {
+      return maxDoc;
+    }
+  }
+  
+  /** 
+   * A simple implementation of {@link DocValuesProducer#getDocsWithField} that 
+   * returns {@code true} if a document has any ordinals.
+   * <p>
+   * Codecs can choose to use this (or implement it more efficiently another way), but
+   * in most cases a Bits is unnecessary anyway: users can check this as they go.
+   */
+  public static class SortedSetDocsWithField implements Bits {
+    final SortedSetDocValues in;
+    final int maxDoc;
+    
+    public SortedSetDocsWithField(SortedSetDocValues in, int maxDoc) {
+      this.in = in;
+      this.maxDoc = maxDoc;
+    }
+    
+    @Override
+    public boolean get(int index) {
+      in.setDocument(index);
+      return in.nextOrd() != SortedSetDocValues.NO_MORE_ORDS;
+    }
+    
+    @Override
+    public int length() {
+      return maxDoc;
+    }
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/MissingOrdRemapper.java b/lucene/core/src/java/org/apache/lucene/codecs/MissingOrdRemapper.java
new file mode 100644
index 00000000000..61e17598bcb
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/MissingOrdRemapper.java
@@ -0,0 +1,124 @@
+package org.apache.lucene.codecs;
+
+import java.util.Iterator;
+
+import org.apache.lucene.util.BytesRef;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * a utility class to write missing values for SORTED_SET as if they were the empty string
+ * (to simulate pre-Lucene4.5 dv behavior for testing old codecs)
+ */
+// nocommit: move this to test-framework with all the impersonators of
+// these old codecs once new memory/disk codecs are written that support missing
+public class MissingOrdRemapper {
+  
+  /** insert an empty byte[] to the front of this iterable */
+  public static Iterable<BytesRef> insertEmptyValue(final Iterable<BytesRef> iterable) {
+    return new Iterable<BytesRef>() {
+      @Override
+      public Iterator<BytesRef> iterator() {
+        return new Iterator<BytesRef>() {
+          boolean seenEmpty = false;
+          Iterator<BytesRef> in = iterable.iterator();
+          
+          @Override
+          public boolean hasNext() {
+            return !seenEmpty || in.hasNext();
+          }
+
+          @Override
+          public BytesRef next() {
+            if (!seenEmpty) {
+              seenEmpty = true;
+              return new BytesRef();
+            } else {
+              return in.next();
+            }
+          }
+
+          @Override
+          public void remove() {
+            throw new UnsupportedOperationException();
+          }
+        };
+      }
+    };
+  }
+  
+  /** remaps ord -1 to ord 0 on this iterable.
*/ + public static Iterable mapMissingToOrd0(final Iterable iterable) { + return new Iterable() { + @Override + public Iterator iterator() { + return new Iterator() { + Iterator in = iterable.iterator(); + + @Override + public boolean hasNext() { + return in.hasNext(); + } + + @Override + public Number next() { + Number n = in.next(); + if (n.longValue() == -1) { + return 0; + } else { + return n; + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + }; + } + + /** remaps every ord+1 on this iterable */ + public static Iterable mapAllOrds(final Iterable iterable) { + return new Iterable() { + @Override + public Iterator iterator() { + return new Iterator() { + Iterator in = iterable.iterator(); + + @Override + public boolean hasNext() { + return in.hasNext(); + } + + @Override + public Number next() { + Number n = in.next(); + return n.longValue()+1; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + }; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java index 21a082c1893..54617702c07 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesReader.java @@ -35,6 +35,7 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.PagedBytes; @@ -620,6 +621,11 @@ final class Lucene40DocValuesReader extends DocValuesProducer { throw new IllegalStateException("Lucene 4.0 does not support SortedSet: how did you pull this off?"); } + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + return new Bits.MatchAllBits(state.segmentInfo.getDocCount()); + } + @Override public void close() throws IOException { dir.close(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java index a1f6dc47d82..edd3bbd240a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java @@ -25,6 +25,7 @@ import java.util.NoSuchElementException; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.MissingOrdRemapper; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; @@ -106,7 +107,8 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { long count = 0; for (Number nv : values) { - final long v = nv.longValue(); + // TODO: support this as MemoryDVFormat (and be smart about missing maybe) + final long v = nv == null ? 
0 : nv.longValue(); if (gcd != 1) { if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { @@ -142,7 +144,7 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) { meta.writeByte(UNCOMPRESSED); // uncompressed for (Number nv : values) { - data.writeByte((byte) nv.longValue()); + data.writeByte(nv == null ? 0 : (byte) nv.longValue()); } } else { meta.writeByte(TABLE_COMPRESSED); // table-compressed @@ -160,7 +162,7 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE); for(Number nv : values) { - writer.add(encode.get(nv.longValue())); + writer.add(encode.get(nv == null ? 0 : nv.longValue())); } writer.finish(); } @@ -173,7 +175,8 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); for (Number nv : values) { - writer.add((nv.longValue() - minValue) / gcd); + long value = nv == null ? 0 : nv.longValue(); + writer.add((value - minValue) / gcd); } writer.finish(); } else { @@ -184,7 +187,7 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); for (Number nv : values) { - writer.add(nv.longValue()); + writer.add(nv == null ? 0 : nv.longValue()); } writer.finish(); } @@ -216,9 +219,12 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { int maxLength = Integer.MIN_VALUE; final long startFP = data.getFilePointer(); for(BytesRef v : values) { - minLength = Math.min(minLength, v.length); - maxLength = Math.max(maxLength, v.length); - data.writeBytes(v.bytes, v.offset, v.length); + final int length = v == null ? 0 : v.length; + minLength = Math.min(minLength, length); + maxLength = Math.max(maxLength, length); + if (v != null) { + data.writeBytes(v.bytes, v.offset, v.length); + } } meta.writeLong(startFP); meta.writeLong(data.getFilePointer() - startFP); @@ -234,7 +240,9 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE); long addr = 0; for (BytesRef v : values) { - addr += v.length; + if (v != null) { + addr += v.length; + } writer.add(addr); } writer.finish(); @@ -262,6 +270,33 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { @Override public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + // three cases for simulating the old writer: + // 1. no missing + // 2. missing (and empty string in use): remap ord=-1 -> ord=0 + // 3. 
missing (and empty string not in use): remap all ords +1, insert empty string into values + boolean anyMissing = false; + for (Number n : docToOrd) { + if (n.longValue() == -1) { + anyMissing = true; + break; + } + } + + boolean hasEmptyString = false; + for (BytesRef b : values) { + hasEmptyString = b.length == 0; + break; + } + + if (!anyMissing) { + // nothing to do + } else if (hasEmptyString) { + docToOrd = MissingOrdRemapper.mapMissingToOrd0(docToOrd); + } else { + docToOrd = MissingOrdRemapper.mapAllOrds(docToOrd); + values = MissingOrdRemapper.insertEmptyValue(values); + } + // write the ordinals as numerics addNumericField(field, docToOrd, false); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java index 16ecf187b93..c2a95fb8195 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java @@ -429,6 +429,15 @@ class Lucene42DocValuesProducer extends DocValuesProducer { } }; } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) { + return new SortedSetDocsWithField(getSortedSet(field), maxDoc); + } else { + return new Bits.MatchAllBits(maxDoc); + } + } @Override public void close() throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java index 72053a8b3a1..3ed6797e783 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java @@ -36,6 +36,7 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; @@ -265,6 +266,12 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { DocValuesProducer producer = fields.get(field.name); return producer == null ? null : producer.getSortedSet(field); } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + DocValuesProducer producer = fields.get(field.name); + return producer == null ? null : producer.getDocsWithField(field); + } @Override public void close() throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java b/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java index 1b0e4168969..a3e28e85c11 100644 --- a/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/AtomicReader.java @@ -207,6 +207,12 @@ public abstract class AtomicReader extends IndexReader { * this field. The returned instance should only be * used by a single thread. */ public abstract SortedSetDocValues getSortedSetDocValues(String field) throws IOException; + + /** Returns a {@link Bits} at the size of reader.maxDoc(), + * with turned on bits for each docid that does have a value for this field, + * or null if no DocValues were indexed for this field. 
The + * returned instance should only be used by a single thread */ + public abstract Bits getDocsWithField(String field) throws IOException; /** Returns {@link NumericDocValues} representing norms * for this field, or null if no {@link NumericDocValues} diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java index 643408ff9b0..553f9ff1ff7 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java @@ -26,6 +26,8 @@ import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator; import org.apache.lucene.util.ByteBlockPool; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Counter; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer; import org.apache.lucene.util.packed.PackedInts; @@ -38,6 +40,9 @@ class BinaryDocValuesWriter extends DocValuesWriter { private final ByteBlockPool pool; private final AppendingDeltaPackedLongBuffer lengths; + private final OpenBitSet docsWithField; + private final Counter iwBytesUsed; + private long bytesUsed; private final FieldInfo fieldInfo; private int addedValues = 0; @@ -45,6 +50,10 @@ class BinaryDocValuesWriter extends DocValuesWriter { this.fieldInfo = fieldInfo; this.pool = new ByteBlockPool(new DirectTrackingAllocator(iwBytesUsed)); this.lengths = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT); + this.iwBytesUsed = iwBytesUsed; + this.docsWithField = new OpenBitSet(); + this.bytesUsed = docsWithFieldBytesUsed(); + iwBytesUsed.addAndGet(bytesUsed); } public void addValue(int docID, BytesRef value) { @@ -66,6 +75,19 @@ class BinaryDocValuesWriter extends DocValuesWriter { addedValues++; lengths.add(value.length); pool.append(value); + docsWithField.set(docID); + updateBytesUsed(); + } + + private long docsWithFieldBytesUsed() { + // nocommit: this is not correct + return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG; + } + + private void updateBytesUsed() { + final long newBytesUsed = docsWithFieldBytesUsed(); + iwBytesUsed.addAndGet(newBytesUsed - bytesUsed); + bytesUsed = newBytesUsed; } @Override @@ -111,19 +133,23 @@ class BinaryDocValuesWriter extends DocValuesWriter { if (!hasNext()) { throw new NoSuchElementException(); } + final BytesRef v; if (upto < size) { int length = (int) lengthsIterator.next(); value.grow(length); value.length = length; pool.readBytes(byteOffset, value.bytes, value.offset, value.length); byteOffset += length; + if (docsWithField.get(upto)) { + v = value; + } else { + v = null; + } } else { - // This is to handle last N documents not having - // this DV field in the end of the segment: - value.length = 0; + v = null; } upto++; - return value; + return v; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 27a1aface60..5a702d29e39 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1280,7 +1280,8 @@ public class CheckIndex { if (reader.getBinaryDocValues(fieldInfo.name) != null || reader.getNumericDocValues(fieldInfo.name) != null || reader.getSortedDocValues(fieldInfo.name) != null || - reader.getSortedSetDocValues(fieldInfo.name) != null) { + 
reader.getSortedSetDocValues(fieldInfo.name) != null || + reader.getDocsWithField(fieldInfo.name) != null) { throw new RuntimeException("field: " + fieldInfo.name + " has docvalues but should omit them!"); } } @@ -1301,26 +1302,37 @@ public class CheckIndex { return status; } - private static void checkBinaryDocValues(String fieldName, AtomicReader reader, BinaryDocValues dv) { + private static void checkBinaryDocValues(String fieldName, AtomicReader reader, BinaryDocValues dv, Bits docsWithField) { BytesRef scratch = new BytesRef(); for (int i = 0; i < reader.maxDoc(); i++) { dv.get(i, scratch); assert scratch.isValid(); + if (docsWithField.get(i) == false && scratch.length > 0) { + throw new RuntimeException("dv for field: " + fieldName + " is missing but has value=" + scratch + " for doc: " + i); + } } } - private static void checkSortedDocValues(String fieldName, AtomicReader reader, SortedDocValues dv) { - checkBinaryDocValues(fieldName, reader, dv); + private static void checkSortedDocValues(String fieldName, AtomicReader reader, SortedDocValues dv, Bits docsWithField) { + checkBinaryDocValues(fieldName, reader, dv, docsWithField); final int maxOrd = dv.getValueCount()-1; FixedBitSet seenOrds = new FixedBitSet(dv.getValueCount()); int maxOrd2 = -1; for (int i = 0; i < reader.maxDoc(); i++) { int ord = dv.getOrd(i); - if (ord < 0 || ord > maxOrd) { + if (ord == -1) { + if (docsWithField.get(i)) { + throw new RuntimeException("dv for field: " + fieldName + " has -1 ord but is not marked missing for doc: " + i); + } + } else if (ord < -1 || ord > maxOrd) { throw new RuntimeException("ord out of bounds: " + ord); + } else { + if (!docsWithField.get(i)) { + throw new RuntimeException("dv for field: " + fieldName + " is missing but has ord=" + ord + " for doc: " + i); + } + maxOrd2 = Math.max(maxOrd2, ord); + seenOrds.set(ord); } - maxOrd2 = Math.max(maxOrd2, ord); - seenOrds.set(ord); } if (maxOrd != maxOrd2) { throw new RuntimeException("dv for field: " + fieldName + " reports wrong maxOrd=" + maxOrd + " but this is not the case: " + maxOrd2); @@ -1342,7 +1354,7 @@ public class CheckIndex { } } - private static void checkSortedSetDocValues(String fieldName, AtomicReader reader, SortedSetDocValues dv) { + private static void checkSortedSetDocValues(String fieldName, AtomicReader reader, SortedSetDocValues dv, Bits docsWithField) { final long maxOrd = dv.getValueCount()-1; OpenBitSet seenOrds = new OpenBitSet(dv.getValueCount()); long maxOrd2 = -1; @@ -1350,16 +1362,28 @@ public class CheckIndex { dv.setDocument(i); long lastOrd = -1; long ord; - while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { - if (ord <= lastOrd) { - throw new RuntimeException("ords out of order: " + ord + " <= " + lastOrd + " for doc: " + i); + if (docsWithField.get(i)) { + int ordCount = 0; + while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + ordCount++; + if (ord <= lastOrd) { + throw new RuntimeException("ords out of order: " + ord + " <= " + lastOrd + " for doc: " + i); + } + if (ord < 0 || ord > maxOrd) { + throw new RuntimeException("ord out of bounds: " + ord); + } + lastOrd = ord; + maxOrd2 = Math.max(maxOrd2, ord); + seenOrds.set(ord); } - if (ord < 0 || ord > maxOrd) { - throw new RuntimeException("ord out of bounds: " + ord); + if (ordCount == 0) { + throw new RuntimeException("dv for field: " + fieldName + " has no ordinals but is not marked missing for doc: " + i); + } + } else { + long o = dv.nextOrd(); + if (o != SortedSetDocValues.NO_MORE_ORDS) { + throw new 
RuntimeException("dv for field: " + fieldName + " is marked missing but has ord=" + o + " for doc: " + i); } - lastOrd = ord; - maxOrd2 = Math.max(maxOrd2, ord); - seenOrds.set(ord); } } if (maxOrd != maxOrd2) { @@ -1383,17 +1407,26 @@ public class CheckIndex { } } - private static void checkNumericDocValues(String fieldName, AtomicReader reader, NumericDocValues ndv) { + private static void checkNumericDocValues(String fieldName, AtomicReader reader, NumericDocValues ndv, Bits docsWithField) { for (int i = 0; i < reader.maxDoc(); i++) { - ndv.get(i); + long value = ndv.get(i); + if (docsWithField.get(i) == false && value > 0) { + throw new RuntimeException("dv for field: " + fieldName + " is marked missing but has value=" + value + " for doc: " + i); + } } } private static void checkDocValues(FieldInfo fi, AtomicReader reader, PrintStream infoStream, DocValuesStatus status) throws Exception { + Bits docsWithField = reader.getDocsWithField(fi.name); + if (docsWithField == null) { + throw new RuntimeException(fi.name + " docsWithField does not exist"); + } else if (docsWithField.length() != reader.maxDoc()) { + throw new RuntimeException(fi.name + " docsWithField has incorrect length: " + docsWithField.length() + ",expected: " + reader.maxDoc()); + } switch(fi.getDocValuesType()) { case SORTED: status.totalSortedFields++; - checkSortedDocValues(fi.name, reader, reader.getSortedDocValues(fi.name)); + checkSortedDocValues(fi.name, reader, reader.getSortedDocValues(fi.name), docsWithField); if (reader.getBinaryDocValues(fi.name) != null || reader.getNumericDocValues(fi.name) != null || reader.getSortedSetDocValues(fi.name) != null) { @@ -1402,7 +1435,7 @@ public class CheckIndex { break; case SORTED_SET: status.totalSortedSetFields++; - checkSortedSetDocValues(fi.name, reader, reader.getSortedSetDocValues(fi.name)); + checkSortedSetDocValues(fi.name, reader, reader.getSortedSetDocValues(fi.name), docsWithField); if (reader.getBinaryDocValues(fi.name) != null || reader.getNumericDocValues(fi.name) != null || reader.getSortedDocValues(fi.name) != null) { @@ -1411,7 +1444,7 @@ public class CheckIndex { break; case BINARY: status.totalBinaryFields++; - checkBinaryDocValues(fi.name, reader, reader.getBinaryDocValues(fi.name)); + checkBinaryDocValues(fi.name, reader, reader.getBinaryDocValues(fi.name), docsWithField); if (reader.getNumericDocValues(fi.name) != null || reader.getSortedDocValues(fi.name) != null || reader.getSortedSetDocValues(fi.name) != null) { @@ -1420,7 +1453,7 @@ public class CheckIndex { break; case NUMERIC: status.totalNumericFields++; - checkNumericDocValues(fi.name, reader, reader.getNumericDocValues(fi.name)); + checkNumericDocValues(fi.name, reader, reader.getNumericDocValues(fi.name), docsWithField); if (reader.getBinaryDocValues(fi.name) != null || reader.getSortedDocValues(fi.name) != null || reader.getSortedSetDocValues(fi.name) != null) { @@ -1435,7 +1468,7 @@ public class CheckIndex { private static void checkNorms(FieldInfo fi, AtomicReader reader, PrintStream infoStream) throws IOException { switch(fi.getNormType()) { case NUMERIC: - checkNumericDocValues(fi.name, reader, reader.getNormValues(fi.name)); + checkNumericDocValues(fi.name, reader, reader.getNormValues(fi.name), new Bits.MatchAllBits(reader.maxDoc())); break; default: throw new AssertionError("wtf: " + fi.getNormType()); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java index 90f2e4514f6..cb1b30154dd 
100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesProcessor.java @@ -143,7 +143,7 @@ final class DocValuesProcessor extends StoredFieldsConsumer { DocValuesWriter writer = writers.get(fieldInfo.name); NumericDocValuesWriter numericWriter; if (writer == null) { - numericWriter = new NumericDocValuesWriter(fieldInfo, bytesUsed); + numericWriter = new NumericDocValuesWriter(fieldInfo, bytesUsed, true); writers.put(fieldInfo.name, numericWriter); } else if (!(writer instanceof NumericDocValuesWriter)) { throw new IllegalArgumentException("Incompatible DocValues type: field \"" + fieldInfo.name + "\" changed from " + getTypeDesc(writer) + " to numeric"); diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java index 93be7a66105..4a8a55a3433 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java @@ -414,4 +414,10 @@ public class FilterAtomicReader extends AtomicReader { return in.getNormValues(field); } + @Override + public Bits getDocsWithField(String field) throws IOException { + ensureOpen(); + return in.getDocsWithField(field); + } + } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java index 8f262a8bb5e..1e6671ea05c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java @@ -22,6 +22,7 @@ import java.util.List; import org.apache.lucene.index.MultiTermsEnum.TermsEnumIndex; import org.apache.lucene.index.MultiTermsEnum.TermsEnumWithSlice; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.packed.AppendingPackedLongBuffer; import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer; @@ -135,6 +136,51 @@ public class MultiDocValues { }; } } + + /** Returns a Bits for a reader's docsWithField (potentially merging on-the-fly) + *
+ * <p>
+ * This is a slow way to access this bitset. Instead, access them per-segment + * with {@link AtomicReader#getDocsWithField(String)} + *
+ * </p>
+ * */ + public static Bits getDocsWithField(final IndexReader r, final String field) throws IOException { + final List leaves = r.leaves(); + final int size = leaves.size(); + if (size == 0) { + return null; + } else if (size == 1) { + return leaves.get(0).reader().getDocsWithField(field); + } + + boolean anyReal = false; + boolean anyMissing = false; + final Bits[] values = new Bits[size]; + final int[] starts = new int[size+1]; + for (int i = 0; i < size; i++) { + AtomicReaderContext context = leaves.get(i); + Bits v = context.reader().getDocsWithField(field); + if (v == null) { + v = new Bits.MatchNoBits(context.reader().maxDoc()); + anyMissing = true; + } else { + anyReal = true; + if (v instanceof Bits.MatchAllBits == false) { + anyMissing = true; + } + } + values[i] = v; + starts[i] = context.docBase; + } + starts[size] = r.maxDoc(); + + if (!anyReal) { + return null; + } else if (!anyMissing) { + return new Bits.MatchAllBits(r.maxDoc()); + } else { + return new MultiBits(values, starts, false); + } + } /** Returns a BinaryDocValues for a reader's docvalues (potentially merging on-the-fly) *

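The helper added above returns null when no segment has docvalues for the field, MatchAllBits when no document is missing, and otherwise a MultiBits stitched together from the per-segment readers. A minimal caller-side sketch of the slow composite path (the helper class and the field name "price" are illustrative, not part of this patch):

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.MultiDocValues;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.Bits;

    class DocsWithFieldExample {
      /** Counts documents that actually carry a value for "price". */
      static int countDocsWithPrice(Directory dir) throws Exception {
        DirectoryReader reader = DirectoryReader.open(dir);
        try {
          Bits docsWithField = MultiDocValues.getDocsWithField(reader, "price");
          if (docsWithField == null) {
            return 0; // no segment has docvalues for this field
          }
          int count = 0;
          for (int doc = 0; doc < reader.maxDoc(); doc++) {
            if (docsWithField.get(doc)) {
              count++; // a real value, not a filled-in default
            }
          }
          return count;
        } finally {
          reader.close();
        }
      }
    }

As the javadoc notes, per-segment access through AtomicReader#getDocsWithField(String) avoids the on-the-fly merge cost and is preferred in hot loops.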
diff --git a/lucene/core/src/java/org/apache/lucene/index/NormsConsumerPerField.java b/lucene/core/src/java/org/apache/lucene/index/NormsConsumerPerField.java index 4a3219eaa23..724c9ed05e0 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NormsConsumerPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/NormsConsumerPerField.java @@ -44,7 +44,7 @@ final class NormsConsumerPerField extends InvertedDocEndConsumerPerField impleme if (fieldInfo.isIndexed() && !fieldInfo.omitsNorms()) { if (consumer == null) { fieldInfo.setNormValueType(FieldInfo.DocValuesType.NUMERIC); - consumer = new NumericDocValuesWriter(fieldInfo, docState.docWriter.bytesUsed); + consumer = new NumericDocValuesWriter(fieldInfo, docState.docWriter.bytesUsed, false); } consumer.addValue(docState.docID, similarity.computeNorm(fieldState)); } diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java index cc070830927..7c5aa83fdae 100644 --- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java @@ -23,6 +23,8 @@ import java.util.NoSuchElementException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.util.Counter; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer; import org.apache.lucene.util.packed.PackedInts; @@ -35,14 +37,18 @@ class NumericDocValuesWriter extends DocValuesWriter { private AppendingDeltaPackedLongBuffer pending; private final Counter iwBytesUsed; private long bytesUsed; + private final OpenBitSet docsWithField; private final FieldInfo fieldInfo; + private final boolean trackDocsWithField; - public NumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { + public NumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed, boolean trackDocsWithField) { pending = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT); - bytesUsed = pending.ramBytesUsed(); + docsWithField = new OpenBitSet(); + bytesUsed = pending.ramBytesUsed() + docsWithFieldBytesUsed(); this.fieldInfo = fieldInfo; this.iwBytesUsed = iwBytesUsed; iwBytesUsed.addAndGet(bytesUsed); + this.trackDocsWithField = trackDocsWithField; } public void addValue(int docID, long value) { @@ -56,12 +62,20 @@ class NumericDocValuesWriter extends DocValuesWriter { } pending.add(value); + if (trackDocsWithField) { + docsWithField.set(docID); + } updateBytesUsed(); } + + private long docsWithFieldBytesUsed() { + // nocommit: this is not correct + return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG; + } private void updateBytesUsed() { - final long newBytesUsed = pending.ramBytesUsed(); + final long newBytesUsed = pending.ramBytesUsed() + docsWithFieldBytesUsed(); iwBytesUsed.addAndGet(newBytesUsed - bytesUsed); bytesUsed = newBytesUsed; } @@ -109,14 +123,18 @@ class NumericDocValuesWriter extends DocValuesWriter { if (!hasNext()) { throw new NoSuchElementException(); } - long value; + Long value; if (upto < size) { - value = iter.next(); + long v = iter.next(); + if (!trackDocsWithField || docsWithField.get(upto)) { + value = v; + } else { + value = null; + } } else { - value = 0; + value = trackDocsWithField ? 
null : MISSING; } upto++; - // TODO: make reusable Number return value; } diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java index c7174d8e7b5..cbc4bbd7f3e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelAtomicReader.java @@ -285,6 +285,13 @@ public class ParallelAtomicReader extends AtomicReader { return reader == null ? null : reader.getSortedSetDocValues(field); } + @Override + public Bits getDocsWithField(String field) throws IOException { + ensureOpen(); + AtomicReader reader = fieldToReader.get(field); + return reader == null ? null : reader.getDocsWithField(field); + } + @Override public NumericDocValues getNormValues(String field) throws IOException { ensureOpen(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java index ab0348293a9..3a526aad54f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java @@ -36,6 +36,7 @@ import org.apache.lucene.index.SegmentReader.CoreClosedListener; import org.apache.lucene.store.CompoundFileDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.util.IOUtils; @@ -87,6 +88,13 @@ final class SegmentCoreReaders { return new HashMap(); } }; + + final CloseableThreadLocal> docsWithFieldLocal = new CloseableThreadLocal>() { + @Override + protected Map initialValue() { + return new HashMap(); + } + }; final CloseableThreadLocal> normsLocal = new CloseableThreadLocal>() { @Override @@ -274,6 +282,30 @@ final class SegmentCoreReaders { return dvs; } + + Bits getDocsWithField(String field) throws IOException { + FieldInfo fi = fieldInfos.fieldInfo(field); + if (fi == null) { + // Field does not exist + return null; + } + if (fi.getDocValuesType() == null) { + // Field was not indexed with doc values + return null; + } + + assert dvProducer != null; + + Map dvFields = docsWithFieldLocal.get(); + + Bits dvs = dvFields.get(field); + if (dvs == null) { + dvs = dvProducer.getDocsWithField(fi); + dvFields.put(field, dvs); + } + + return dvs; + } NumericDocValues getNormValues(String field) throws IOException { FieldInfo fi = fieldInfos.fieldInfo(field); @@ -300,8 +332,8 @@ final class SegmentCoreReaders { void decRef() throws IOException { if (ref.decrementAndGet() == 0) { - IOUtils.close(termVectorsLocal, fieldsReaderLocal, docValuesLocal, normsLocal, fields, dvProducer, - termVectorsReaderOrig, fieldsReaderOrig, cfsReader, normsProducer); + IOUtils.close(termVectorsLocal, fieldsReaderLocal, docValuesLocal, normsLocal, docsWithFieldLocal, fields, + dvProducer, termVectorsReaderOrig, fieldsReaderOrig, cfsReader, normsProducer); notifyCoreClosedListeners(); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java index f121e85b10f..718687bcc85 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java @@ -30,6 +30,7 @@ import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.index.FieldInfo.DocValuesType; import 
org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; @@ -156,24 +157,32 @@ final class SegmentMerger { if (type != null) { if (type == DocValuesType.NUMERIC) { List toMerge = new ArrayList(); + List docsWithField = new ArrayList(); for (AtomicReader reader : mergeState.readers) { NumericDocValues values = reader.getNumericDocValues(field.name); + Bits bits = reader.getDocsWithField(field.name); if (values == null) { values = NumericDocValues.EMPTY; + bits = new Bits.MatchNoBits(reader.maxDoc()); } toMerge.add(values); + docsWithField.add(bits); } - consumer.mergeNumericField(field, mergeState, toMerge); + consumer.mergeNumericField(field, mergeState, toMerge, docsWithField); } else if (type == DocValuesType.BINARY) { List toMerge = new ArrayList(); + List docsWithField = new ArrayList(); for (AtomicReader reader : mergeState.readers) { BinaryDocValues values = reader.getBinaryDocValues(field.name); + Bits bits = reader.getDocsWithField(field.name); if (values == null) { values = BinaryDocValues.EMPTY; + bits = new Bits.MatchNoBits(reader.maxDoc()); } toMerge.add(values); + docsWithField.add(bits); } - consumer.mergeBinaryField(field, mergeState, toMerge); + consumer.mergeBinaryField(field, mergeState, toMerge, docsWithField); } else if (type == DocValuesType.SORTED) { List toMerge = new ArrayList(); for (AtomicReader reader : mergeState.readers) { @@ -216,14 +225,16 @@ final class SegmentMerger { for (FieldInfo field : mergeState.fieldInfos) { if (field.hasNorms()) { List toMerge = new ArrayList(); + List docsWithField = new ArrayList(); for (AtomicReader reader : mergeState.readers) { NumericDocValues norms = reader.getNormValues(field.name); if (norms == null) { norms = NumericDocValues.EMPTY; } toMerge.add(norms); + docsWithField.add(new Bits.MatchAllBits(reader.maxDoc())); } - consumer.mergeNumericField(field, mergeState, toMerge); + consumer.mergeNumericField(field, mergeState, toMerge, docsWithField); } } success = true; diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java index 8214a980cd9..c6cf702955b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java @@ -223,6 +223,12 @@ public final class SegmentReader extends AtomicReader { return core.getNumericDocValues(field); } + @Override + public Bits getDocsWithField(String field) throws IOException { + ensureOpen(); + return core.getDocsWithField(field); + } + @Override public BinaryDocValues getBinaryDocValues(String field) throws IOException { ensureOpen(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java index bce0ef64257..b7af7d0ba88 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeReaderWrapper.java @@ -91,6 +91,12 @@ public final class SlowCompositeReaderWrapper extends AtomicReader { return MultiDocValues.getNumericValues(in, field); } + @Override + public Bits getDocsWithField(String field) throws IOException { + ensureOpen(); + return MultiDocValues.getDocsWithField(in, field); + } + @Override public BinaryDocValues getBinaryDocValues(String field) throws IOException { ensureOpen(); 
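In the SegmentMerger changes above, each source reader now contributes a Bits in parallel with its values: toMerge.get(i) pairs with docsWithField.get(i), readers that lack the field contribute MatchNoBits, and norms always get MatchAllBits since every indexed document has a norm. This is what lets a codec distinguish a stored 0 (or an empty BytesRef) from a genuinely absent value. A sketch of the per-source iteration a merge routine performs under that invariant (the class and method are illustrative, not the actual DocValuesConsumer code):

    import org.apache.lucene.index.AtomicReader;
    import org.apache.lucene.index.NumericDocValues;
    import org.apache.lucene.util.Bits;

    class MergeSketch {
      /** Sums the values that are really present in one merge source. */
      static long sumPresentValues(AtomicReader reader, NumericDocValues values,
                                   Bits docsWithField) {
        long sum = 0;
        for (int doc = 0; doc < reader.maxDoc(); doc++) {
          if (docsWithField.get(doc)) {
            sum += values.get(doc); // a real value, possibly 0
          }
          // else: the doc has no value; a merge records it as missing,
          // e.g. NumericDocValuesWriter's iterator yields null for such docs
        }
        return sum;
      }
    }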
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java index c7dae5b3dd9..1968a791157 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java @@ -37,7 +37,8 @@ public abstract class SortedDocValues extends BinaryDocValues { * Returns the ordinal for the specified docID. * @param docID document ID to lookup * @return ordinal for the document: this is dense, starts at 0, then - * increments by 1 for the next value in sorted order. + * increments by 1 for the next value in sorted order. Note that + * missing values are indicated by -1. */ public abstract int getOrd(int docID); @@ -71,7 +72,7 @@ public abstract class SortedDocValues extends BinaryDocValues { public static final SortedDocValues EMPTY = new SortedDocValues() { @Override public int getOrd(int docID) { - return 0; + return -1; } @Override @@ -83,7 +84,7 @@ public abstract class SortedDocValues extends BinaryDocValues { @Override public int getValueCount() { - return 1; + return 0; } }; diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java index d337a0ca1c5..4d42a2e15c9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java @@ -30,19 +30,19 @@ import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray; import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.Counter; import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.packed.AppendingPackedLongBuffer; +import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer; import org.apache.lucene.util.packed.PackedInts; /** Buffers up pending byte[] per doc, deref and sorting via * int ord, then flushes when segment flushes. 
*/ class SortedDocValuesWriter extends DocValuesWriter { final BytesRefHash hash; - private AppendingPackedLongBuffer pending; + private AppendingDeltaPackedLongBuffer pending; private final Counter iwBytesUsed; private long bytesUsed; // this currently only tracks differences in 'pending' private final FieldInfo fieldInfo; - private static final BytesRef EMPTY = new BytesRef(BytesRef.EMPTY_BYTES); + private static final int EMPTY_ORD = -1; public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { this.fieldInfo = fieldInfo; @@ -52,7 +52,7 @@ class SortedDocValuesWriter extends DocValuesWriter { new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)), BytesRefHash.DEFAULT_CAPACITY, new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed)); - pending = new AppendingPackedLongBuffer(PackedInts.COMPACT); + pending = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT); bytesUsed = pending.ramBytesUsed(); iwBytesUsed.addAndGet(bytesUsed); } @@ -70,7 +70,7 @@ class SortedDocValuesWriter extends DocValuesWriter { // Fill in any holes: while(pending.size() < docID) { - addOneValue(EMPTY); + pending.add(EMPTY_ORD); } addOneValue(value); @@ -79,8 +79,9 @@ class SortedDocValuesWriter extends DocValuesWriter { @Override public void finish(int maxDoc) { while(pending.size() < maxDoc) { - addOneValue(EMPTY); + pending.add(EMPTY_ORD); } + updateBytesUsed(); } private void addOneValue(BytesRef value) { @@ -177,7 +178,7 @@ class SortedDocValuesWriter extends DocValuesWriter { // iterates over the ords for each doc we have in ram private class OrdsIterator implements Iterator { - final AppendingPackedLongBuffer.Iterator iter = pending.iterator(); + final AppendingDeltaPackedLongBuffer.Iterator iter = pending.iterator(); final int ordMap[]; final int maxDoc; int docUpto; @@ -200,8 +201,7 @@ class SortedDocValuesWriter extends DocValuesWriter { } int ord = (int) iter.next(); docUpto++; - // TODO: make reusable Number - return ordMap[ord]; + return ord == -1 ? ord : ordMap[ord]; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldCache.java b/lucene/core/src/java/org/apache/lucene/search/FieldCache.java index f87fb51acf3..cce1024ed5d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FieldCache.java +++ b/lucene/core/src/java/org/apache/lucene/search/FieldCache.java @@ -104,26 +104,6 @@ public interface FieldCache { } }; } - - /** Returns MISSING/-1 ordinal for every document */ - public static final SortedDocValues EMPTY_TERMSINDEX = new SortedDocValues() { - @Override - public int getOrd(int docID) { - return -1; - } - - @Override - public void lookupOrd(int ord, BytesRef result) { - result.bytes = MISSING; - result.offset = 0; - result.length = 0; - } - - @Override - public int getValueCount() { - return 0; - } - }; /** * Placeholder indicating creation of this cache is currently in-progress. @@ -266,13 +246,10 @@ public interface FieldCache { } }; - /** Checks the internal cache for an appropriate entry, and if none is found, * reads the terms in field and returns a bit set at the size of * reader.maxDoc(), with turned on bits for each docid that - * does have a value for this field. Note that if the field was only indexed - * as DocValues then this method will not work (it will return a Bits stating - * that no documents contain the field). + * does have a value for this field. 
*/ public Bits getDocsWithField(AtomicReader reader, String field) throws IOException; diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java b/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java index 2dc5cb6bf9a..b8e81d17ea4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java +++ b/lucene/core/src/java/org/apache/lucene/search/FieldCacheImpl.java @@ -501,8 +501,7 @@ class FieldCacheImpl implements FieldCache { // field does not exist or has no value return new Bits.MatchNoBits(reader.maxDoc()); } else if (fieldInfo.hasDocValues()) { - // doc values are dense - return new Bits.MatchAllBits(reader.maxDoc()); + return reader.getDocsWithField(field); } else if (!fieldInfo.isIndexed()) { return new Bits.MatchNoBits(reader.maxDoc()); } @@ -944,13 +943,13 @@ class FieldCacheImpl implements FieldCache { } else { final FieldInfo info = reader.getFieldInfos().fieldInfo(field); if (info == null) { - return EMPTY_TERMSINDEX; + return SortedDocValues.EMPTY; } else if (info.hasDocValues()) { // we don't try to build a sorted instance from numeric/binary doc // values because dedup can be very costly throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); } else if (!info.isIndexed()) { - return EMPTY_TERMSINDEX; + return SortedDocValues.EMPTY; } return (SortedDocValues) caches.get(SortedDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio), false); } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java b/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java index 2c498ae7211..da1cf218617 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java @@ -569,7 +569,7 @@ public class TestFieldCache extends LuceneTestCase { assertEquals(2, sortedSet.getValueCount()); bits = FieldCache.DEFAULT.getDocsWithField(ar, "sortedset"); - assertTrue(bits instanceof Bits.MatchAllBits); + assertTrue(bits.get(0)); } ir.close(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java index ad7cb27caca..80daa1cc588 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesProducer.java @@ -31,15 +31,18 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; class Facet42DocValuesProducer extends DocValuesProducer { private final Map fields = new HashMap(); + private final int maxDoc; Facet42DocValuesProducer(SegmentReadState state) throws IOException { String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Facet42DocValuesFormat.EXTENSION); IndexInput in = state.directory.openInput(fileName, state.context); + this.maxDoc = state.segmentInfo.getDocCount(); boolean success = false; try { CodecUtil.checkHeader(in, Facet42DocValuesFormat.CODEC, @@ -80,6 +83,11 @@ class Facet42DocValuesProducer extends DocValuesProducer { throw new UnsupportedOperationException("FacetsDocValues only implements binary"); } + @Override + public Bits 
getDocsWithField(FieldInfo field) throws IOException { + return new Bits.MatchAllBits(maxDoc); // TODO: have codec impl this? + } + @Override public void close() throws IOException { } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/util/FacetsPayloadMigrationReader.java b/lucene/facet/src/java/org/apache/lucene/facet/util/FacetsPayloadMigrationReader.java index ec4d2c9d638..e3d998d5159 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/util/FacetsPayloadMigrationReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/util/FacetsPayloadMigrationReader.java @@ -40,6 +40,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; /** @@ -222,6 +223,18 @@ public class FacetsPayloadMigrationReader extends FilterAtomicReader { } } + @Override + public Bits getDocsWithField(String field) throws IOException { + Term term = fieldTerms.get(field); + if (term == null) { + return super.getDocsWithField(field); + } else { + // we shouldn't return null, even if the term does not exist or has no + // payloads, since we already marked the field as having DocValues. + return new Bits.MatchAllBits(maxDoc()); + } + } + @Override public FieldInfos getFieldInfos() { FieldInfos innerInfos = super.getFieldInfos(); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java index a207b970de3..a2b1bd45b2d 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java @@ -422,6 +422,11 @@ public class WeightedSpanTermExtractor { public NumericDocValues getNormValues(String field) throws IOException { return super.getNormValues(FIELD_NAME); } + + @Override + public Bits getDocsWithField(String field) throws IOException { + return super.getDocsWithField(FIELD_NAME); + } } /** diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 8107aab3b68..3a0c6bbe62f 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -756,6 +756,11 @@ public class MemoryIndex { return null; } + @Override + public Bits getDocsWithField(String field) throws IOException { + return null; + } + private class MemoryFields extends Fields { @Override public Iterator iterator() { diff --git a/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java b/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java index f7cce128692..469357dab63 100644 --- a/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java +++ b/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java @@ -221,6 +221,27 @@ public class SortingAtomicReader extends FilterAtomicReader { } } + private static class SortingBits implements Bits { + + private final Bits in; + private final Sorter.DocMap docMap; + + public SortingBits(final Bits in, Sorter.DocMap docMap) { + this.in = in; + this.docMap = docMap; + } + + @Override + public boolean get(int index) { + return in.get(docMap.newToOld(index)); + 
} + + @Override + public int length() { + return in.length(); + } + } + private static class SortingSortedDocValues extends SortedDocValues { private final SortedDocValues in; @@ -743,20 +764,9 @@ public class SortingAtomicReader extends FilterAtomicReader { final Bits inLiveDocs = in.getLiveDocs(); if (inLiveDocs == null) { return null; + } else { + return new SortingBits(inLiveDocs, docMap); } - return new Bits() { - - @Override - public boolean get(int index) { - return inLiveDocs.get(docMap.newToOld(index)); - } - - @Override - public int length() { - return inLiveDocs.length(); - } - - }; } @Override @@ -796,6 +806,16 @@ public class SortingAtomicReader extends FilterAtomicReader { } } + @Override + public Bits getDocsWithField(String field) throws IOException { + Bits bits = in.getDocsWithField(field); + if (bits == null || bits instanceof Bits.MatchAllBits || bits instanceof Bits.MatchNoBits) { + return bits; + } else { + return new SortingBits(bits, docMap); + } + } + @Override public Fields getTermVectors(final int docID) throws IOException { return in.getTermVectors(docMap.newToOld(docID)); diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java index 998bea7835f..a59e4c58c06 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java @@ -33,6 +33,7 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.OpenBitSet; @@ -75,11 +76,10 @@ public class AssertingDocValuesFormat extends DocValuesFormat { public void addNumericField(FieldInfo field, Iterable values) throws IOException { int count = 0; for (Number v : values) { - assert v != null; count++; } assert count == maxDoc; - checkIterator(values.iterator(), maxDoc); + checkIterator(values.iterator(), maxDoc, true); in.addNumericField(field, values); } @@ -87,12 +87,11 @@ public class AssertingDocValuesFormat extends DocValuesFormat { public void addBinaryField(FieldInfo field, Iterable values) throws IOException { int count = 0; for (BytesRef b : values) { - assert b != null; - assert b.isValid(); + assert b == null || b.isValid(); count++; } assert count == maxDoc; - checkIterator(values.iterator(), maxDoc); + checkIterator(values.iterator(), maxDoc, true); in.addBinaryField(field, values); } @@ -117,15 +116,17 @@ public class AssertingDocValuesFormat extends DocValuesFormat { for (Number v : docToOrd) { assert v != null; int ord = v.intValue(); - assert ord >= 0 && ord < valueCount; - seenOrds.set(ord); + assert ord >= -1 && ord < valueCount; + if (ord >= 0) { + seenOrds.set(ord); + } count++; } assert count == maxDoc; assert seenOrds.cardinality() == valueCount; - checkIterator(values.iterator(), valueCount); - checkIterator(docToOrd.iterator(), maxDoc); + checkIterator(values.iterator(), valueCount, false); + checkIterator(docToOrd.iterator(), maxDoc, false); in.addSortedField(field, values, docToOrd); } @@ -169,18 +170,18 @@ public class AssertingDocValuesFormat extends DocValuesFormat { assert docCount == maxDoc; assert 
seenOrds.cardinality() == valueCount; - checkIterator(values.iterator(), valueCount); - checkIterator(docToOrdCount.iterator(), maxDoc); - checkIterator(ords.iterator(), ordCount); + checkIterator(values.iterator(), valueCount, false); + checkIterator(docToOrdCount.iterator(), maxDoc, false); + checkIterator(ords.iterator(), ordCount, false); in.addSortedSetField(field, values, docToOrdCount, ords); } - private void checkIterator(Iterator iterator, long expectedSize) { + private void checkIterator(Iterator iterator, long expectedSize, boolean allowNull) { for (long i = 0; i < expectedSize; i++) { boolean hasNext = iterator.hasNext(); assert hasNext; T v = iterator.next(); - assert v != null; + assert allowNull || v != null; try { iterator.remove(); throw new AssertionError("broken iterator (supports remove): " + iterator); @@ -244,6 +245,15 @@ public class AssertingDocValuesFormat extends DocValuesFormat { assert values != null; return new AssertingAtomicReader.AssertingSortedSetDocValues(values, maxDoc); } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + assert field.getDocValuesType() != null; + Bits bits = in.getDocsWithField(field); + assert bits != null; + assert bits.length() == maxDoc; + return bits; // TODO: add AssertingBits w/ bounds check + } @Override public void close() throws IOException { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java index 52f36d2ae40..f6098dc1f21 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java @@ -27,6 +27,7 @@ import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.DocValuesProducer.SortedSetDocsWithField; import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer; import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat; import org.apache.lucene.index.BinaryDocValues; @@ -38,6 +39,7 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.packed.BlockPackedReader; @@ -50,9 +52,11 @@ class CheapBastardDocValuesProducer extends DocValuesProducer { private final Map ordIndexes; private final Map binaries; private final IndexInput data; + private final int maxDoc; CheapBastardDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + this.maxDoc = state.segmentInfo.getDocCount(); // read in the entries from the metadata file. 
IndexInput in = state.directory.openInput(metaName, state.context); boolean success = false; @@ -380,6 +384,15 @@ class CheapBastardDocValuesProducer extends DocValuesProducer { } }; } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) { + return new SortedSetDocsWithField(getSortedSet(field), maxDoc); + } else { + return new Bits.MatchAllBits(maxDoc); + } + } @Override public void close() throws IOException { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java index 2769abd6e13..15b3081c848 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java @@ -24,7 +24,9 @@ import java.util.TreeSet; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.MissingOrdRemapper; import org.apache.lucene.codecs.lucene40.Lucene40FieldInfosReader.LegacyDocValuesType; +import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; @@ -54,7 +56,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { long minValue = Long.MAX_VALUE; long maxValue = Long.MIN_VALUE; for (Number n : values) { - long v = n.longValue(); + long v = n == null ? 0 : n.longValue(); minValue = Math.min(minValue, v); maxValue = Math.max(maxValue, v); } @@ -92,7 +94,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { Lucene40DocValuesFormat.INTS_VERSION_CURRENT); output.writeInt(1); // size for (Number n : values) { - output.writeByte(n.byteValue()); + output.writeByte(n == null ? 0 : n.byteValue()); } } @@ -103,7 +105,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { Lucene40DocValuesFormat.INTS_VERSION_CURRENT); output.writeInt(2); // size for (Number n : values) { - output.writeShort(n.shortValue()); + output.writeShort(n == null ? 0 : n.shortValue()); } } @@ -114,7 +116,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { Lucene40DocValuesFormat.INTS_VERSION_CURRENT); output.writeInt(4); // size for (Number n : values) { - output.writeInt(n.intValue()); + output.writeInt(n == null ? 0 : n.intValue()); } } @@ -131,7 +133,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { // writes longs output.writeByte(Lucene40DocValuesFormat.VAR_INTS_FIXED_64); for (Number n : values) { - output.writeLong(n.longValue()); + output.writeLong(n == null ? 0 : n.longValue()); } } else { // writes packed ints @@ -143,7 +145,8 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { PackedInts.bitsRequired(delta), PackedInts.DEFAULT); for (Number n : values) { - writer.add(n.longValue() - minValue); + long v = n == null ? 
0 : n.longValue(); + writer.add(v - minValue); } writer.finish(); } @@ -156,6 +159,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { int minLength = Integer.MAX_VALUE; int maxLength = Integer.MIN_VALUE; for (BytesRef b : values) { + if (b == null) { + b = new BytesRef(); // 4.0 doesnt distinguish + } minLength = Math.min(minLength, b.length); maxLength = Math.max(maxLength, b.length); if (uniqueValues != null) { @@ -243,7 +249,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { output.writeInt(length); for (BytesRef v : values) { - output.writeBytes(v.bytes, v.offset, v.length); + if (v != null) { + output.writeBytes(v.bytes, v.offset, v.length); + } } } @@ -264,7 +272,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { final long startPos = data.getFilePointer(); for (BytesRef v : values) { - data.writeBytes(v.bytes, v.offset, v.length); + if (v != null) { + data.writeBytes(v.bytes, v.offset, v.length); + } } /* addresses */ @@ -279,7 +289,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { long currentPosition = 0; for (BytesRef v : values) { w.add(currentPosition); - currentPosition += v.length; + if (v != null) { + currentPosition += v.length; + } } // write sentinel assert currentPosition == maxAddress; @@ -301,7 +313,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { // deduplicate TreeSet dictionary = new TreeSet(); for (BytesRef v : values) { - dictionary.add(BytesRef.deepCopyOf(v)); + dictionary.add(v == null ? new BytesRef() : BytesRef.deepCopyOf(v)); } /* values */ @@ -318,6 +330,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(valueCount-1), PackedInts.DEFAULT); for (BytesRef v : values) { + if (v == null) { + v = new BytesRef(); + } int ord = dictionary.headSet(v).size(); w.add(ord); } @@ -338,7 +353,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { // deduplicate TreeSet dictionary = new TreeSet(); for (BytesRef v : values) { - dictionary.add(BytesRef.deepCopyOf(v)); + dictionary.add(v == null ? new BytesRef() : BytesRef.deepCopyOf(v)); } /* values */ @@ -359,7 +374,7 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { final PackedInts.Writer w = PackedInts.getWriter(index, maxDoc, PackedInts.bitsRequired(currentAddress), PackedInts.DEFAULT); for (BytesRef v : values) { - w.add(valueToAddress.get(v)); + w.add(valueToAddress.get(v == null ? new BytesRef() : v)); } w.finish(); } @@ -385,6 +400,15 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { maxLength = Math.max(maxLength, b.length); } + // but dont use fixed if there are missing values (we are simulating how lucene40 wrote dv...) + boolean anyMissing = false; + for (Number n : docToOrd) { + if (n.longValue() == -1) { + anyMissing = true; + break; + } + } + boolean success = false; IndexOutput data = null; IndexOutput index = null; @@ -394,12 +418,22 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { try { data = dir.createOutput(dataName, state.context); index = dir.createOutput(indexName, state.context); - if (minLength == maxLength) { + if (minLength == maxLength && !anyMissing) { // fixed byte[] addFixedSortedBytesField(field, data, index, values, docToOrd, minLength); } else { // var byte[] - addVarSortedBytesField(field, data, index, values, docToOrd); + // three cases for simulating the old writer: + // 1. no missing + // 2. missing (and empty string in use): remap ord=-1 -> ord=0 + // 3. 
missing (and empty string not in use): remap all ords +1, insert empty string into values + if (!anyMissing) { + addVarSortedBytesField(field, data, index, values, docToOrd); + } else if (minLength == 0) { + addVarSortedBytesField(field, data, index, values, MissingOrdRemapper.mapMissingToOrd0(docToOrd)); + } else { + addVarSortedBytesField(field, data, index, MissingOrdRemapper.insertEmptyValue(values), MissingOrdRemapper.mapAllOrds(docToOrd)); + } } success = true; } finally { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index 81c5cbef204..7dd38ae1c69 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -52,6 +52,7 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.LuceneTestCase; @@ -1073,8 +1074,10 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { doc.add(newTextField("id", "noValue", Field.Store.YES)); w.addDocument(doc); } - BytesRef bytesRef = new BytesRef(); - hash.add(bytesRef); // add empty value for the gaps + if (!codecSupportsDocsWithField("field")) { + BytesRef bytesRef = new BytesRef(); + hash.add(bytesRef); // add empty value for the gaps + } if (rarely()) { w.commit(); } @@ -2197,5 +2200,205 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { doTestNumericsVsStoredFields(longs); } } + + public void testTwoNumbersOneMissing() throws IOException { + assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new NumericDocValuesField("dv1", 0)); + iw.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + NumericDocValues dv = ar.getNumericDocValues("dv1"); + assertEquals(0, dv.get(0)); + assertEquals(0, dv.get(1)); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + ir.close(); + directory.close(); + } + + public void testTwoNumbersOneMissingWithMerging() throws IOException { + assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new NumericDocValuesField("dv1", 0)); + iw.addDocument(doc); + iw.commit(); + doc = new Document(); + 
doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + NumericDocValues dv = ar.getNumericDocValues("dv1"); + assertEquals(0, dv.get(0)); + assertEquals(0, dv.get(1)); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + ir.close(); + directory.close(); + } + + public void testThreeNumbersOneMissingWithMerging() throws IOException { + assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new NumericDocValuesField("dv1", 0)); + iw.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.commit(); + doc = new Document(); + doc.add(new StringField("id", "2", Field.Store.YES)); + doc.add(new NumericDocValuesField("dv1", 5)); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + NumericDocValues dv = ar.getNumericDocValues("dv1"); + assertEquals(0, dv.get(0)); + assertEquals(0, dv.get(1)); + assertEquals(5, dv.get(2)); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + assertTrue(docsWithField.get(2)); + ir.close(); + directory.close(); + } + + public void testTwoBytesOneMissing() throws IOException { + assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv1", new BytesRef())); + iw.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + BinaryDocValues dv = ar.getBinaryDocValues("dv1"); + BytesRef ref = new BytesRef(); + dv.get(0, ref); + assertEquals(new BytesRef(), ref); + dv.get(1, ref); + assertEquals(new BytesRef(), ref); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + ir.close(); + directory.close(); + } + + public void testTwoBytesOneMissingWithMerging() throws IOException { + assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", 
"0", Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv1", new BytesRef())); + iw.addDocument(doc); + iw.commit(); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + BinaryDocValues dv = ar.getBinaryDocValues("dv1"); + BytesRef ref = new BytesRef(); + dv.get(0, ref); + assertEquals(new BytesRef(), ref); + dv.get(1, ref); + assertEquals(new BytesRef(), ref); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + ir.close(); + directory.close(); + } + + public void testThreeBytesOneMissingWithMerging() throws IOException { + assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv1", new BytesRef())); + iw.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.YES)); + iw.addDocument(doc); + iw.commit(); + doc = new Document(); + doc.add(new StringField("id", "2", Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv1", new BytesRef("boo"))); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + IndexReader ir = DirectoryReader.open(directory); + assertEquals(1, ir.leaves().size()); + AtomicReader ar = ir.leaves().get(0).reader(); + BinaryDocValues dv = ar.getBinaryDocValues("dv1"); + BytesRef ref = new BytesRef(); + dv.get(0, ref); + assertEquals(new BytesRef(), ref); + dv.get(1, ref); + assertEquals(new BytesRef(), ref); + dv.get(2, ref); + assertEquals(new BytesRef("boo"), ref); + Bits docsWithField = ar.getDocsWithField("dv1"); + assertTrue(docsWithField.get(0)); + assertFalse(docsWithField.get(1)); + assertTrue(docsWithField.get(2)); + ir.close(); + directory.close(); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java index a0a2521ef7d..66c24874b55 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/FieldFilterAtomicReader.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.Set; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.FilterIterator; /** @@ -136,6 +137,11 @@ public final class FieldFilterAtomicReader extends FilterAtomicReader { return hasField(field) ? super.getNormValues(field) : null; } + @Override + public Bits getDocsWithField(String field) throws IOException { + return hasField(field) ? 
super.getDocsWithField(field) : null; + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("FieldFilterAtomicReader(reader="); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java index 3060d683c26..f14f772e0df 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java @@ -1368,6 +1368,13 @@ public abstract class LuceneTestCase extends Assert { } return true; } + + /** Returns true if the codec for the field "supports" docsWithField + * (other codecs return MatchAllBits, because you couldnt write missing values before) */ + public static boolean codecSupportsDocsWithField(String field) { + // currently only one codec! + return _TestUtil.getDocValuesFormat(Codec.getDefault(), field).equals("SimpleText"); + } public void assertReaderEquals(String info, IndexReader leftReader, IndexReader rightReader) throws IOException { assertReaderStatisticsEquals(info, leftReader, rightReader); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java index e68d42c2bf0..38b2592fdeb 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java @@ -45,6 +45,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; @@ -742,6 +743,15 @@ public class _TestUtil { return p.getName(); } } + + public static String getDocValuesFormat(Codec codec, String field) { + DocValuesFormat d = codec.docValuesFormat(); + if (d instanceof PerFieldDocValuesFormat) { + return ((PerFieldDocValuesFormat)d).getDocValuesFormatForField(field).getName(); + } else { + return d.getName(); + } + } public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException { String[] files = dir.listAll(); diff --git a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java index 4dfc114a189..bca44a19d76 100644 --- a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java +++ b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java @@ -221,6 +221,7 @@ public class DocValuesFacets { if (schemaField.multiValued()) { missingCount = SimpleFacets.getFieldMissingCount(searcher,docs,schemaField.getName()); } else { + // nocommit: support missing count (ord = -1) for single-valued here. 
missingCount = 0; // single-valued dv is implicitly 0 } } diff --git a/solr/core/src/test/org/apache/solr/search/TestDocSet.java b/solr/core/src/test/org/apache/solr/search/TestDocSet.java index faebe6c261b..c2a282066e5 100644 --- a/solr/core/src/test/org/apache/solr/search/TestDocSet.java +++ b/solr/core/src/test/org/apache/solr/search/TestDocSet.java @@ -403,6 +403,11 @@ public class TestDocSet extends LuceneTestCase { return null; } + @Override + public Bits getDocsWithField(String field) throws IOException { + return null; + } + @Override public NumericDocValues getNormValues(String field) { return null; From 2c6bf041900192ff7ec58b0d2bc0c8dcfae85b5c Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 16 Aug 2013 21:19:19 +0000 Subject: [PATCH 03/16] bump 4.5 codec git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1514897 13f79535-47bb-0310-9956-ffa450edef68 --- .../codecs/diskdv/DiskDocValuesFormat.java | 17 +- .../codecs/diskdv/DiskDocValuesProducer.java | 433 ++-------- .../codecs/diskdv/DiskNormsFormat.java} | 18 +- .../java/org/apache/lucene/codecs/Codec.java | 2 +- .../org/apache/lucene/codecs/FilterCodec.java | 4 +- .../lucene/codecs/lucene40/Lucene40Codec.java | 1 - .../lucene/codecs/lucene42/package.html | 8 +- .../lucene/codecs/lucene45/Lucene45Codec.java | 141 ++++ .../lucene45/Lucene45DocValuesConsumer.java} | 24 +- .../lucene45/Lucene45DocValuesFormat.java | 167 ++++ .../lucene45/Lucene45DocValuesProducer.java | 755 ++++++++++++++++++ .../lucene/codecs/lucene45/package.html | 396 +++++++++ .../org/apache/lucene/codecs/package.html | 8 +- .../org/apache/lucene/index/CheckIndex.java | 2 +- .../services/org.apache.lucene.codecs.Codec | 1 + .../org.apache.lucene.codecs.DocValuesFormat | 1 + .../org/apache/lucene/TestExternalCodecs.java | 24 +- .../TestLucene45DocValuesFormat.java} | 9 +- .../perfield/TestPerFieldDocValuesFormat.java | 6 +- .../perfield/TestPerFieldPostingsFormat2.java | 10 +- .../apache/lucene/index/TestAddIndexes.java | 6 +- .../index/TestAllFilesHaveCodecHeader.java | 4 +- .../lucene/index/TestDuelingCodecs.java | 2 +- .../lucene/util/TestNamedSPILoader.java | 6 +- .../facet/codecs/facet42/Facet42Codec.java | 1 + .../facet42/Facet42DocValuesConsumer.java | 12 +- .../valuesource/DoubleFieldSource.java | 4 +- .../valuesource/FloatFieldSource.java | 4 +- .../function/valuesource/IntFieldSource.java | 4 +- .../function/valuesource/LongFieldSource.java | 4 +- lucene/site/xsl/index.xsl | 2 +- .../bbox/BBoxSimilarityValueSource.java | 6 +- .../analyzing/AnalyzingInfixSuggester.java | 4 +- .../codecs/asserting/AssertingCodec.java | 6 +- .../asserting/AssertingDocValuesFormat.java | 6 +- .../asserting/AssertingNormsFormat.java | 1 + .../cheapbastard/CheapBastardCodec.java | 10 +- .../CheapBastardDocValuesFormat.java | 74 -- .../CheapBastardDocValuesProducer.java | 444 ---------- .../codecs/compressing/CompressingCodec.java | 6 +- .../compressing/FastCompressingCodec.java | 4 +- .../FastDecompressionCompressingCodec.java | 1 + .../HighCompressionCompressingCodec.java | 1 + .../index/BaseStoredFieldsFormatTestCase.java | 4 +- .../org/apache/lucene/index/RandomCodec.java | 12 +- .../util/TestRuleSetupAndRestoreClassEnv.java | 4 +- .../org/apache/lucene/util/_TestUtil.java | 11 +- .../org.apache.lucene.codecs.DocValuesFormat | 1 - .../apache/solr/core/SchemaCodecFactory.java | 4 +- .../apache/solr/request/NumericFacets.java | 5 +- .../solr/collection1/conf/schema_codec.xml | 2 +- .../apache/solr/core/TestCodecSupport.java | 8 +- 52 files 
changed, 1673 insertions(+), 1017 deletions(-) rename lucene/{test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardNormsFormat.java => codecs/src/java/org/apache/lucene/codecs/diskdv/DiskNormsFormat.java} (70%) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45Codec.java rename lucene/{codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java => core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java} (93%) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene45/package.html rename lucene/{codecs/src/test/org/apache/lucene/codecs/diskdv/TestCheapBastardDocValuesFormat.java => core/src/test/org/apache/lucene/codecs/lucene45/TestLucene45DocValuesFormat.java} (79%) delete mode 100644 lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java delete mode 100644 lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java index 43a7d57eecf..f3fd35e6554 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesFormat.java @@ -22,8 +22,11 @@ import java.io.IOException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer; +import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.BytesRef; /** * DocValues format that keeps most things on disk. 
@@ -40,7 +43,12 @@ public final class DiskDocValuesFormat extends DocValuesFormat { @Override public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - return new DiskDocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + return new Lucene45DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION) { + @Override + protected void addTermsDict(FieldInfo field, Iterable values) throws IOException { + addBinaryField(field, values); + } + }; } @Override @@ -52,11 +60,4 @@ public final class DiskDocValuesFormat extends DocValuesFormat { public static final String DATA_EXTENSION = "dvdd"; public static final String META_CODEC = "DiskDocValuesMetadata"; public static final String META_EXTENSION = "dvdm"; - public static final int VERSION_START = 0; - public static final int VERSION_COMPRESSED_TERMS = 1; - public static final int VERSION_CURRENT = VERSION_COMPRESSED_TERMS; - public static final byte NUMERIC = 0; - public static final byte BINARY = 1; - public static final byte SORTED = 2; - public static final byte SORTED_SET = 3; } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java index 11a60fdaf6c..41d2e87b9fe 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java @@ -17,35 +17,26 @@ package org.apache.lucene.codecs.diskdv; * limitations under the License. */ -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.DELTA_COMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.GCD_COMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.TABLE_COMPRESSED; - -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.BINARY_PREFIX_COMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.DELTA_COMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.GCD_COMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.TABLE_COMPRESSED; import java.io.IOException; -import java.util.Comparator; import java.util.HashMap; import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.DocValuesProducer.SortedSetDocsWithField; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.TermsEnum.SeekStatus; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; 
import org.apache.lucene.util.BytesRef; @@ -56,32 +47,28 @@ import org.apache.lucene.util.packed.PackedInts; class DiskDocValuesProducer extends DocValuesProducer { private final Map numerics; - private final Map binaries; private final Map ords; private final Map ordIndexes; + private final Map binaries; private final IndexInput data; private final int maxDoc; - - // memory-resident structures - private final Map addressInstances = new HashMap(); - private final Map ordIndexInstances = new HashMap(); DiskDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + this.maxDoc = state.segmentInfo.getDocCount(); // read in the entries from the metadata file. IndexInput in = state.directory.openInput(metaName, state.context); - this.maxDoc = state.segmentInfo.getDocCount(); boolean success = false; final int version; try { version = CodecUtil.checkHeader(in, metaCodec, - DiskDocValuesFormat.VERSION_CURRENT, - DiskDocValuesFormat.VERSION_CURRENT); + Lucene45DocValuesFormat.VERSION_CURRENT, + Lucene45DocValuesFormat.VERSION_CURRENT); numerics = new HashMap(); ords = new HashMap(); ordIndexes = new HashMap(); binaries = new HashMap(); - readFields(in, state.fieldInfos); + readFields(in); success = true; } finally { @@ -97,10 +84,10 @@ class DiskDocValuesProducer extends DocValuesProducer { String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); data = state.directory.openInput(dataName, state.context); final int version2 = CodecUtil.checkHeader(data, dataCodec, - DiskDocValuesFormat.VERSION_CURRENT, - DiskDocValuesFormat.VERSION_CURRENT); + Lucene45DocValuesFormat.VERSION_CURRENT, + Lucene45DocValuesFormat.VERSION_CURRENT); if (version != version2) { - throw new CorruptIndexException("Format versions mismatch"); + throw new CorruptIndexException("Versions mismatch"); } success = true; @@ -109,61 +96,62 @@ class DiskDocValuesProducer extends DocValuesProducer { IOUtils.closeWhileHandlingException(this.data); } } + } - private void readFields(IndexInput meta, FieldInfos infos) throws IOException { + private void readFields(IndexInput meta) throws IOException { int fieldNumber = meta.readVInt(); while (fieldNumber != -1) { byte type = meta.readByte(); - if (type == DiskDocValuesFormat.NUMERIC) { + if (type == Lucene45DocValuesFormat.NUMERIC) { numerics.put(fieldNumber, readNumericEntry(meta)); - } else if (type == DiskDocValuesFormat.BINARY) { + } else if (type == Lucene45DocValuesFormat.BINARY) { BinaryEntry b = readBinaryEntry(meta); binaries.put(fieldNumber, b); - } else if (type == DiskDocValuesFormat.SORTED) { + } else if (type == Lucene45DocValuesFormat.SORTED) { // sorted = binary + numeric if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); } - if (meta.readByte() != DiskDocValuesFormat.BINARY) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + if (meta.readByte() != Lucene45DocValuesFormat.BINARY) { + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); } BinaryEntry b = readBinaryEntry(meta); binaries.put(fieldNumber, b); if (meta.readVInt() != 
fieldNumber) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); } NumericEntry n = readNumericEntry(meta); ords.put(fieldNumber, n); - } else if (type == DiskDocValuesFormat.SORTED_SET) { + } else if (type == Lucene45DocValuesFormat.SORTED_SET) { // sortedset = binary + numeric + ordIndex if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); } - if (meta.readByte() != DiskDocValuesFormat.BINARY) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + if (meta.readByte() != Lucene45DocValuesFormat.BINARY) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); } BinaryEntry b = readBinaryEntry(meta); binaries.put(fieldNumber, b); if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); } NumericEntry n1 = readNumericEntry(meta); ords.put(fieldNumber, n1); if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); } NumericEntry n2 = readNumericEntry(meta); ordIndexes.put(fieldNumber, n2); @@ -209,27 +197,18 @@ class DiskDocValuesProducer extends DocValuesProducer { static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException { BinaryEntry entry = new BinaryEntry(); - entry.format = meta.readVInt(); + int format = meta.readVInt(); + if (format != Lucene45DocValuesConsumer.BINARY_FIXED_UNCOMPRESSED && format != Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED) { + throw new CorruptIndexException("Unexpected format for binary entry: " + format + ", input=" + meta); + } entry.minLength = meta.readVInt(); entry.maxLength = meta.readVInt(); entry.count = meta.readVLong(); entry.offset = meta.readLong(); - switch(entry.format) { - case BINARY_FIXED_UNCOMPRESSED: - break; - case BINARY_PREFIX_COMPRESSED: - entry.addressInterval = meta.readVInt(); - 
entry.addressesOffset = meta.readLong(); - entry.packedIntsVersion = meta.readVInt(); - entry.blockSize = meta.readVInt(); - break; - case BINARY_VARIABLE_UNCOMPRESSED: - entry.addressesOffset = meta.readLong(); - entry.packedIntsVersion = meta.readVInt(); - entry.blockSize = meta.readVInt(); - break; - default: - throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); + if (entry.minLength != entry.maxLength) { + entry.addressesOffset = meta.readLong(); + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); } return entry; } @@ -237,10 +216,10 @@ class DiskDocValuesProducer extends DocValuesProducer { @Override public NumericDocValues getNumeric(FieldInfo field) throws IOException { NumericEntry entry = numerics.get(field.number); - return getNumeric(entry); + return getNumeric(field, entry); } - LongNumericDocValues getNumeric(NumericEntry entry) throws IOException { + private LongNumericDocValues getNumeric(FieldInfo field, final NumericEntry entry) throws IOException { final IndexInput data = this.data.clone(); data.seek(entry.offset); @@ -264,12 +243,12 @@ class DiskDocValuesProducer extends DocValuesProducer { } }; case TABLE_COMPRESSED: - final long table[] = entry.table; + final long[] table = entry.table; final int bitsRequired = PackedInts.bitsRequired(table.length - 1); final PackedInts.Reader ords = PackedInts.getDirectReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, (int) entry.count, bitsRequired); return new LongNumericDocValues() { @Override - public long get(long id) { + long get(long id) { return table[(int) ords.get((int) id)]; } }; @@ -281,15 +260,10 @@ class DiskDocValuesProducer extends DocValuesProducer { @Override public BinaryDocValues getBinary(FieldInfo field) throws IOException { BinaryEntry bytes = binaries.get(field.number); - switch(bytes.format) { - case BINARY_FIXED_UNCOMPRESSED: - return getFixedBinary(field, bytes); - case BINARY_VARIABLE_UNCOMPRESSED: - return getVariableBinary(field, bytes); - case BINARY_PREFIX_COMPRESSED: - return getCompressedBinary(field, bytes); - default: - throw new AssertionError(); + if (bytes.minLength == bytes.maxLength) { + return getFixedBinary(field, bytes); + } else { + return getVariableBinary(field, bytes); } } @@ -318,22 +292,13 @@ class DiskDocValuesProducer extends DocValuesProducer { private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { final IndexInput data = this.data.clone(); - - final MonotonicBlockPackedReader addresses; - synchronized (addressInstances) { - MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number); - if (addrInstance == null) { - data.seek(bytes.addressesOffset); - addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false); - addressInstances.put(field.number, addrInstance); - } - addresses = addrInstance; - } + data.seek(bytes.addressesOffset); + final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, true); return new LongBinaryDocValues() { @Override public void get(long id, BytesRef result) { - long startAddress = bytes.offset + (id == 0 ? 0 : addresses.get(id-1)); + long startAddress = bytes.offset + (id == 0 ? 
0 : + addresses.get(id-1)); long endAddress = bytes.offset + addresses.get(id); int length = (int) (endAddress - startAddress); try { @@ -352,39 +317,11 @@ class DiskDocValuesProducer extends DocValuesProducer { }; } - private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final IndexInput data = this.data.clone(); - final long interval = bytes.addressInterval; - - final MonotonicBlockPackedReader addresses; - synchronized (addressInstances) { - MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number); - if (addrInstance == null) { - data.seek(bytes.addressesOffset); - final long size; - if (bytes.count % interval == 0) { - size = bytes.count / interval; - } else { - size = 1L + bytes.count / interval; - } - addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false); - addressInstances.put(field.number, addrInstance); - } - addresses = addrInstance; - } - - return new CompressedBinaryDocValues(bytes, addresses, data); - } - @Override public SortedDocValues getSorted(FieldInfo field) throws IOException { final int valueCount = (int) binaries.get(field.number).count; final BinaryDocValues binary = getBinary(field); - NumericEntry entry = ords.get(field.number); - IndexInput data = this.data.clone(); - data.seek(entry.offset); - final BlockPackedReader ordinals = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - + final NumericDocValues ordinals = getNumeric(field, ords.get(field.number)); return new SortedDocValues() { @Override @@ -401,46 +338,18 @@ class DiskDocValuesProducer extends DocValuesProducer { public int getValueCount() { return valueCount; } - - @Override - public int lookupTerm(BytesRef key) { - if (binary instanceof CompressedBinaryDocValues) { - return (int) ((CompressedBinaryDocValues)binary).lookupTerm(key); - } else { - return super.lookupTerm(key); - } - } - - @Override - public TermsEnum termsEnum() { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues)binary).getTermsEnum(); - } else { - return super.termsEnum(); - } - } }; } @Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { final long valueCount = binaries.get(field.number).count; - // we keep the byte[]s and list of ords on disk, these could be large final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); - final LongNumericDocValues ordinals = getNumeric(ords.get(field.number)); - // but the addresses to the ord stream are in RAM - final MonotonicBlockPackedReader ordIndex; - synchronized (ordIndexInstances) { - MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number); - if (ordIndexInstance == null) { - NumericEntry entry = ordIndexes.get(field.number); - IndexInput data = this.data.clone(); - data.seek(entry.offset); - ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false); - ordIndexInstances.put(field.number, ordIndexInstance); - } - ordIndex = ordIndexInstance; - } + final LongNumericDocValues ordinals = getNumeric(field, ords.get(field.number)); + NumericEntry entry = ordIndexes.get(field.number); + IndexInput data = this.data.clone(); + data.seek(entry.offset); + final MonotonicBlockPackedReader ordIndex = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); return new SortedSetDocValues() { long offset; @@ -472,31 +381,11 @@ 
class DiskDocValuesProducer extends DocValuesProducer { public long getValueCount() { return valueCount; } - - @Override - public long lookupTerm(BytesRef key) { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues)binary).lookupTerm(key); - } else { - return super.lookupTerm(key); - } - } - - @Override - public TermsEnum termsEnum() { - if (binary instanceof CompressedBinaryDocValues) { - return ((CompressedBinaryDocValues)binary).getTermsEnum(); - } else { - return super.termsEnum(); - } - } }; } - + @Override public Bits getDocsWithField(FieldInfo field) throws IOException { - // nocommit: only use this if the field's entry has missing values (write that), - // otherwise return MatchAllBits if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) { return new SortedSetDocsWithField(getSortedSet(field), maxDoc); } else { @@ -525,12 +414,10 @@ class DiskDocValuesProducer extends DocValuesProducer { static class BinaryEntry { long offset; - int format; long count; int minLength; int maxLength; long addressesOffset; - long addressInterval; int packedIntsVersion; int blockSize; } @@ -553,204 +440,4 @@ class DiskDocValuesProducer extends DocValuesProducer { abstract void get(long id, BytesRef Result); } - - // in the compressed case, we add a few additional operations for - // more efficient reverse lookup and enumeration - static class CompressedBinaryDocValues extends LongBinaryDocValues { - final BinaryEntry bytes; - final long interval; - final long numValues; - final long numIndexValues; - final MonotonicBlockPackedReader addresses; - final IndexInput data; - final TermsEnum termsEnum; - - public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, IndexInput data) throws IOException { - this.bytes = bytes; - this.interval = bytes.addressInterval; - this.addresses = addresses; - this.data = data; - this.numValues = bytes.count; - this.numIndexValues = addresses.size(); - this.termsEnum = getTermsEnum(data); - } - - @Override - public void get(long id, BytesRef result) { - try { - termsEnum.seekExact(id); - BytesRef term = termsEnum.term(); - result.bytes = term.bytes; - result.offset = term.offset; - result.length = term.length; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - long lookupTerm(BytesRef key) { - try { - SeekStatus status = termsEnum.seekCeil(key); - if (status == SeekStatus.END) { - return -numValues-1; - } else if (status == SeekStatus.FOUND) { - return termsEnum.ord(); - } else { - return -termsEnum.ord()-1; - } - } catch (IOException bogus) { - throw new RuntimeException(bogus); - } - } - - TermsEnum getTermsEnum() { - try { - return getTermsEnum(data.clone()); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private TermsEnum getTermsEnum(final IndexInput input) throws IOException { - input.seek(bytes.offset); - - return new TermsEnum() { - private long currentOrd = -1; - // TODO: maxLength is negative when all terms are merged away... - private final BytesRef termBuffer = new BytesRef(bytes.maxLength < 0 ? 0 : bytes.maxLength); - private final BytesRef term = new BytesRef(); // TODO: paranoia? 
- - @Override - public BytesRef next() throws IOException { - if (doNext() == null) { - return null; - } else { - setTerm(); - return term; - } - } - - private BytesRef doNext() throws IOException { - if (++currentOrd >= numValues) { - return null; - } else { - int start = input.readVInt(); - int suffix = input.readVInt(); - input.readBytes(termBuffer.bytes, start, suffix); - termBuffer.length = start + suffix; - return termBuffer; - } - } - - @Override - public SeekStatus seekCeil(BytesRef text) throws IOException { - // binary-search just the index values to find the block, - // then scan within the block - long low = 0; - long high = numIndexValues-1; - - while (low <= high) { - long mid = (low + high) >>> 1; - doSeek(mid * interval); - int cmp = termBuffer.compareTo(text); - - if (cmp < 0) { - low = mid + 1; - } else if (cmp > 0) { - high = mid - 1; - } else { - // we got lucky, found an indexed term - setTerm(); - return SeekStatus.FOUND; - } - } - - if (numIndexValues == 0) { - return SeekStatus.END; - } - - // block before insertion point - long block = low-1; - doSeek(block < 0 ? -1 : block * interval); - - while (doNext() != null) { - int cmp = termBuffer.compareTo(text); - if (cmp == 0) { - setTerm(); - return SeekStatus.FOUND; - } else if (cmp > 0) { - setTerm(); - return SeekStatus.NOT_FOUND; - } - } - - return SeekStatus.END; - } - - @Override - public void seekExact(long ord) throws IOException { - doSeek(ord); - setTerm(); - } - - private void doSeek(long ord) throws IOException { - long block = ord / interval; - - if (ord >= currentOrd && block == currentOrd / interval) { - // seek within current block - } else { - // position before start of block - currentOrd = ord - ord % interval - 1; - input.seek(bytes.offset + addresses.get(block)); - } - - while (currentOrd < ord) { - doNext(); - } - } - - private void setTerm() { - // TODO: is there a cleaner way - term.bytes = new byte[termBuffer.length]; - term.offset = 0; - term.copyBytes(termBuffer); - } - - @Override - public BytesRef term() throws IOException { - return term; - } - - @Override - public long ord() throws IOException { - return currentOrd; - } - - @Override - public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUnicodeComparator(); - } - - @Override - public int docFreq() throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long totalTermFreq() throws IOException { - return -1; - } - - @Override - public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { - throw new UnsupportedOperationException(); - } - }; - } - } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardNormsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskNormsFormat.java similarity index 70% rename from lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardNormsFormat.java rename to lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskNormsFormat.java index 5834f9c1f71..63a8ab9c3db 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardNormsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskNormsFormat.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.cheapbastard; +package org.apache.lucene.codecs.diskdv; /* * 
Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,25 +22,25 @@ import java.io.IOException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.NormsFormat; -import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; /** Norms format that keeps all norms on disk */ -public final class CheapBastardNormsFormat extends NormsFormat { +public final class DiskNormsFormat extends NormsFormat { @Override public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException { - return new DiskDocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + return new Lucene45DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); } @Override public DocValuesProducer normsProducer(SegmentReadState state) throws IOException { - return new CheapBastardDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + return new DiskDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); } - static final String DATA_CODEC = "CheapBastardNormsData"; - static final String DATA_EXTENSION = "cbnd"; - static final String META_CODEC = "CheapBastardNormsMetadata"; - static final String META_EXTENSION = "cbnm"; + static final String DATA_CODEC = "DiskNormsData"; + static final String DATA_EXTENSION = "dnvd"; + static final String META_CODEC = "DiskNormsMetadata"; + static final String META_EXTENSION = "dnvm"; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java index 1b2726f06fd..3b98449eb5c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java @@ -119,7 +119,7 @@ public abstract class Codec implements NamedSPILoader.NamedSPI { loader.reload(classloader); } - private static Codec defaultCodec = Codec.forName("Lucene42"); + private static Codec defaultCodec = Codec.forName("Lucene45"); /** expert: returns the default codec used for newly created * {@link IndexWriterConfig}s. diff --git a/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java b/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java index fc4728ae846..d97d577de22 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java @@ -21,13 +21,13 @@ package org.apache.lucene.codecs; * A codec that forwards all its method calls to another codec. *

 * Extend this class when you need to reuse the functionality of an existing
- * codec. For example, if you want to build a codec that redefines Lucene42's
+ * codec. For example, if you want to build a codec that redefines Lucene45's
  * {@link LiveDocsFormat}:

  *   public final class CustomCodec extends FilterCodec {
  *
  *     public CustomCodec() {
- *       super("CustomCodec", new Lucene42Codec());
+ *       super("CustomCodec", new Lucene45Codec());
  *     }
  *
  *     public LiveDocsFormat liveDocsFormat() {
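The javadoc example in the hunk above is cut off mid-method by the patch context. For readability, here is a complete sketch of the pattern it describes; this is our illustration, not code from the patch. Lucene40LiveDocsFormat stands in for whatever custom LiveDocsFormat a real codec would supply, and looking the codec up by name would additionally require an SPI registration in META-INF/services/org.apache.lucene.codecs.Codec.

  package com.example; // hypothetical package for this sketch

  import org.apache.lucene.codecs.FilterCodec;
  import org.apache.lucene.codecs.LiveDocsFormat;
  import org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat;
  import org.apache.lucene.codecs.lucene45.Lucene45Codec;

  public final class CustomCodec extends FilterCodec {

    public CustomCodec() {
      // delegate everything to Lucene45Codec, then override one format below
      super("CustomCodec", new Lucene45Codec());
    }

    @Override
    public LiveDocsFormat liveDocsFormat() {
      // stand-in: a real custom codec would return its own LiveDocsFormat here
      return new Lucene40LiveDocsFormat();
    }
  }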
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java
index c7911f47dac..0e7ac44d02c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40Codec.java
@@ -27,7 +27,6 @@ import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.NormsFormat;
 import org.apache.lucene.codecs.StoredFieldsFormat;
 import org.apache.lucene.codecs.TermVectorsFormat;
-import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 
 /**
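Because the Codec.java hunk above changes what Codec.getDefault() returns, existing applications silently start writing Lucene45 segments. A minimal sketch (assuming the released 4.5 APIs and constants, which this branch may predate) of pinning the codec explicitly rather than relying on the default:

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.codecs.Codec;
  import org.apache.lucene.index.IndexWriterConfig;
  import org.apache.lucene.util.Version;

  // Codec.forName goes through the SPI registry, so "Lucene45" resolves once
  // the codec is listed in META-INF/services/org.apache.lucene.codecs.Codec
  // (this patch adds that entry). Omitting setCodec picks the same new default.
  static IndexWriterConfig newConfig(Analyzer analyzer) {
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_45, analyzer);
    iwc.setCodec(Codec.forName("Lucene45"));
    return iwc;
  }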
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html
index 571b7668c41..ae55e7a63de 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html
@@ -178,7 +178,7 @@ For each field in each document, a value is stored
 that is multiplied into the score for hits on that field.
 
 
-<li>{@link org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat Term Vectors}.
+<li>{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vectors}.
 For each field in each document, the term vector (sometimes called document vector)
 may be stored. A term vector consists of term text and term frequency. To add
 Term Vectors to your index see the
@@ -299,17 +299,17 @@ systems that frequently run out of file handles.
 Encodes additional scoring factors or other per-document information.
-{@link org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat Term Vector Index}
+{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Index}
 .tvx
 Stores offset into the document data file
-{@link org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat Term Vector Documents}
+{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Documents}
 .tvd
 Contains information about each document that has term vectors
-{@link org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat Term Vector Fields}
+{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Fields}
 .tvf
 The field level info about term vectors
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45Codec.java
new file mode 100644
index 00000000000..5ec25e1003c
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45Codec.java
@@ -0,0 +1,141 @@
+package org.apache.lucene.codecs.lucene45;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.FieldInfosFormat;
+import org.apache.lucene.codecs.FilterCodec;
+import org.apache.lucene.codecs.LiveDocsFormat;
+import org.apache.lucene.codecs.NormsFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.SegmentInfoFormat;
+import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat;
+import org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat;
+import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene42.Lucene42FieldInfosFormat;
+import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat;
+import org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat;
+import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
+import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+
+/**
+ * Implements the Lucene 4.5 index format, with configurable per-field postings
+ * and docvalues formats.
+ *

    + * If you want to reuse functionality of this codec in another codec, extend + * {@link FilterCodec}. + * + * @see org.apache.lucene.codecs.lucene45 package documentation for file format details. + * @lucene.experimental + */ +// NOTE: if we make largish changes in a minor release, easier to just make Lucene46Codec or whatever +// if they are backwards compatible or smallish we can probably do the backwards in the postingsreader +// (it writes a minor version, etc). +public class Lucene45Codec extends Codec { + private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat(); + private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene42FieldInfosFormat(); + private final SegmentInfoFormat infosFormat = new Lucene40SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene40LiveDocsFormat(); + + private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene45Codec.this.getPostingsFormatForField(field); + } + }; + + + private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene45Codec.this.getDocValuesFormatForField(field); + } + }; + + /** Sole constructor. */ + public Lucene45Codec() { + super("Lucene45"); + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return fieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return infosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + /** Returns the postings format that should be used for writing + * new segments of field. + * + * The default implementation always returns "Lucene41" + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultFormat; + } + + /** Returns the docvalues format that should be used for writing + * new segments of field. 
+ * + * The default implementation always returns "Lucene45" + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41"); + private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene45"); + + private final NormsFormat normsFormat = new Lucene42NormsFormat(); + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java similarity index 93% rename from lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java index b5124871237..942ee228045 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.diskdv; +package org.apache.lucene.codecs.lucene45; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -37,8 +37,8 @@ import org.apache.lucene.util.packed.BlockPackedWriter; import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; import org.apache.lucene.util.packed.PackedInts; -/** writer for {@link DiskDocValuesFormat} */ -public class DiskDocValuesConsumer extends DocValuesConsumer { +/** writer for {@link Lucene45DocValuesFormat} */ +public class Lucene45DocValuesConsumer extends DocValuesConsumer { static final int BLOCK_SIZE = 16384; static final int ADDRESS_INTERVAL = 16; @@ -60,15 +60,15 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { final IndexOutput data, meta; final int maxDoc; - public DiskDocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + public Lucene45DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { boolean success = false; try { String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); data = state.directory.createOutput(dataName, state.context); - CodecUtil.writeHeader(data, dataCodec, DiskDocValuesFormat.VERSION_CURRENT); + CodecUtil.writeHeader(data, dataCodec, Lucene45DocValuesFormat.VERSION_CURRENT); String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); meta = state.directory.createOutput(metaName, state.context); - CodecUtil.writeHeader(meta, metaCodec, DiskDocValuesFormat.VERSION_CURRENT); + CodecUtil.writeHeader(meta, metaCodec, Lucene45DocValuesFormat.VERSION_CURRENT); maxDoc = state.segmentInfo.getDocCount(); success = true; } finally { @@ -140,7 +140,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { format = DELTA_COMPRESSED; } meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.NUMERIC); + meta.writeByte(Lucene45DocValuesFormat.NUMERIC); meta.writeVInt(format); meta.writeVInt(PackedInts.VERSION_CURRENT); meta.writeLong(data.getFilePointer()); @@ -189,7 +189,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { public void addBinaryField(FieldInfo 
field, Iterable values) throws IOException { // write the byte[] data meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.BINARY); + meta.writeByte(Lucene45DocValuesFormat.BINARY); int minLength = Integer.MAX_VALUE; int maxLength = Integer.MIN_VALUE; final long startFP = data.getFilePointer(); @@ -242,7 +242,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { } else { // header meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.BINARY); + meta.writeByte(Lucene45DocValuesFormat.BINARY); meta.writeVInt(BINARY_PREFIX_COMPRESSED); // now write the bytes: sharing prefixes within a block final long startFP = data.getFilePointer(); @@ -315,7 +315,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { values = MissingOrdRemapper.insertEmptyValue(values); } meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.SORTED); + meta.writeByte(Lucene45DocValuesFormat.SORTED); addTermsDict(field, values); addNumericField(field, docToOrd, false); } @@ -323,7 +323,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { @Override public void addSortedSetField(FieldInfo field, Iterable values, Iterable docToOrdCount, Iterable ords) throws IOException { meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.SORTED_SET); + meta.writeByte(Lucene45DocValuesFormat.SORTED_SET); // write the ord -> byte[] as a binary field addTermsDict(field, values); // write the stream of ords as a numeric field @@ -332,7 +332,7 @@ public class DiskDocValuesConsumer extends DocValuesConsumer { // write the doc -> ord count as a absolute index to the stream meta.writeVInt(field.number); - meta.writeByte(DiskDocValuesFormat.NUMERIC); + meta.writeByte(Lucene45DocValuesFormat.NUMERIC); meta.writeVInt(DELTA_COMPRESSED); meta.writeVInt(PackedInts.VERSION_CURRENT); meta.writeLong(data.getFilePointer()); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java new file mode 100644 index 00000000000..68a44370b30 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesFormat.java @@ -0,0 +1,167 @@ +package org.apache.lucene.codecs.lucene45; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.FieldInfo.DocValuesType; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.SmallFloat; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.packed.BlockPackedWriter; +import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Lucene 4.5 DocValues format. + *

+ * <p>
+ * Encodes the four per-document value types (Numeric, Binary, Sorted, SortedSet) with these strategies:
+ * <p>
+ * {@link DocValuesType#NUMERIC NUMERIC}:
+ * <ul>
+ *   <li>Delta-compressed: per-document integers written in blocks of 16k. For each block
+ *       the minimum value in that block is encoded, and each entry is a delta from that
+ *       minimum value. Each block of deltas is compressed with bitpacking. For more
+ *       information, see {@link BlockPackedWriter}.</li>
+ *   <li>Table-compressed: when the number of unique values is very small (&lt; 256), and
+ *       when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
+ *       a lookup table is written instead. Each per-document entry is instead the ordinal
+ *       into this table, and those ordinals are compressed with bitpacking ({@link PackedInts}).</li>
+ *   <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
+ *       common divisor (GCD) is computed, and quotients are stored using delta-compressed numerics.</li>
+ * </ul>
+ * (A sketch of how a writer might choose among these strategies follows this list.)
+ *
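The three numeric strategies above are chosen from simple statistics over the incoming values. A rough sketch of that decision, in the spirit of the consumer code elsewhere in this patch but with our own names and without its exact space-saving checks:

  import java.math.BigInteger;
  import java.util.HashSet;
  import java.util.Set;

  // Pick a numeric strategy from the observed values (illustrative only).
  static String pickNumericStrategy(long[] values) {
    long minValue = Long.MAX_VALUE;
    Set<Long> uniqueValues = new HashSet<Long>();
    for (long v : values) {
      minValue = Math.min(minValue, v);
      if (uniqueValues.size() < 256) {
        uniqueValues.add(v); // stop tracking once the table would be too big
      }
    }
    long gcd = 0;
    for (long v : values) {
      // gcd of the deltas from the minimum; BigInteger sidesteps overflow
      gcd = BigInteger.valueOf(gcd).gcd(BigInteger.valueOf(v - minValue)).longValue();
    }
    if (uniqueValues.size() < 256) {
      return "TABLE_COMPRESSED"; // tiny value set: bitpacked ordinals into a table
    } else if (gcd > 1) {
      return "GCD_COMPRESSED";   // shared divisor: store (value - min) / gcd
    } else {
      return "DELTA_COMPRESSED"; // default: block-wise deltas, bitpacked
    }
  }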

+ * <p>
+ * {@link DocValuesType#BINARY BINARY}:
+ * <ul>
+ *   <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
+ *       Each document's value can be addressed directly with multiplication ({@code docID * length}).</li>
+ *   <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
+ *       for each document. The addresses are written in blocks of 16k, with the current absolute
+ *       start for the block, and the average (expected) delta per entry. For each document the
+ *       deviation from the delta (actual - expected) is written.</li>
+ *   <li>Prefix-compressed Binary: nocommit</li>
+ * </ul>
+ * <p>
+ * {@link DocValuesType#SORTED SORTED}:
+ * <ul>
+ *   <li>Sorted: an FST mapping deduplicated terms to ordinals is written, along with the per-document
+ *       ordinals written using one of the numeric strategies above.</li>
+ * </ul>
+ * <p>
+ * {@link DocValuesType#SORTED_SET SORTED_SET}:
+ * <ul>
+ *   <li>SortedSet: an FST mapping deduplicated terms to ordinals is written, along with the per-document
+ *       ordinal list written using one of the binary strategies above.</li>
+ * </ul>
+ * (An addressing sketch for the fixed- and variable-width binary cases follows this section.)
+ *
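The fixed- versus variable-width addressing described above reduces to the arithmetic below; the names are ours, but the variable-width case mirrors the startAddress/endAddress computation visible in the producer code later in this patch:

  // Illustrative sketch of the two binary addressing schemes.
  final class BinaryAddressing {

    // Fixed-width: every value is exactly `length` bytes, so addressing is
    // pure multiplication from the start of the field's data.
    static long fixedStart(long dataOffset, int docID, int length) {
      return dataOffset + (long) docID * length;
    }

    // Variable-width: a monotonic `addresses` structure stores the end offset
    // of each document's bytes; the start is the previous document's end.
    static long[] variableRange(long dataOffset, int docID, long[] addresses) {
      long start = dataOffset + (docID == 0 ? 0 : addresses[docID - 1]);
      long end = dataOffset + addresses[docID];
      return new long[] { start, end }; // value length is end - start
    }
  }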

+ * <p>
+ * Files:
+ * <ol>
+ *   <li>.dvd: DocValues data</li>
+ *   <li>.dvm: DocValues metadata</li>
+ * </ol>
+ * <ol>
+ *   <li>
+ *     <p>The DocValues metadata or .dvm file.</p>
+ *     <p>For each DocValues field, this stores metadata, such as the offset into the
+ *        DocValues data (.dvd).</p>
+ *     <p>DocValues metadata (.dvm) --> Header,<FieldNumber,EntryType,Entry>NumFields</p>
+ *     <ul>
+ *       <li>Entry --> NumericEntry | BinaryEntry | SortedEntry</li>
+ *       <li>NumericEntry --> DataOffset,NumericCompressionType,PackedVersion</li>
+ *       <li>BinaryEntry --> DataOffset,DataLength,MinLength,MaxLength,PackedVersion?,BlockSize?</li>
+ *       <li>SortedEntry --> DataOffset,ValueCount</li>
+ *       <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}</li>
+ *       <li>DataOffset,DataLength --> {@link DataOutput#writeLong Int64}</li>
+ *       <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li>
+ *       <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ *     </ul>
+ *     <p>Sorted fields have two entries: a SortedEntry with the FST metadata,
+ *        and an ordinary NumericEntry for the document-to-ord metadata.</p>
+ *     <p>SortedSet fields have two entries: a SortedEntry with the FST metadata,
+ *        and an ordinary BinaryEntry for the document-to-ord-list metadata.</p>
+ *     <p>A FieldNumber of -1 indicates the end of metadata.</p>
+ *     <p>EntryType is 0 (NumericEntry), 1 (BinaryEntry), or 2 (SortedEntry).</p>
+ *     <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd).</p>
+ *     <p>NumericCompressionType indicates how Numeric values will be compressed:</p>
+ *     <ul>
+ *       <li>0 --> delta-compressed. For each block of 16k integers, every integer is delta-encoded
+ *           from the minimum value within the block.</li>
+ *       <li>1 --> gcd-compressed. When all integers share a common divisor, only quotients are stored
+ *           using blocks of delta-encoded ints.</li>
+ *       <li>2 --> table-compressed. When the number of unique numeric values is small and it would save space,
+ *           a lookup table of unique values is written, followed by the ordinal for each document.</li>
+ *     </ul>
+ *     <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
+ *        If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
+ *        Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
+ *        is written for the addresses.</p>
+ *   </li>
+ *   <li>
+ *     <p>The DocValues data or .dvd file.</p>
+ *     <p>For each DocValues field, this stores the actual per-document data (the heavy lifting).</p>
+ *     <p>DocValues data (.dvd) --> Header,<NumericData | BinaryData | SortedData>NumFields</p>
+ *     <ul>
+ *       <li>NumericData --> DeltaCompressedNumerics | TableCompressedNumerics | GCDCompressedNumerics</li>
+ *       <li>BinaryData --> {@link DataOutput#writeByte Byte}DataLength,Addresses</li>
+ *       <li>SortedData --> {@link FST FST<Int64>}</li>
+ *       <li>DeltaCompressedNumerics --> {@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
+ *       <li>TableCompressedNumerics --> TableSize,{@link DataOutput#writeLong Int64}TableSize,{@link PackedInts PackedInts}</li>
+ *       <li>GCDCompressedNumerics --> MinValue,GCD,{@link BlockPackedWriter BlockPackedInts(blockSize=16k)}</li>
+ *       <li>Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=16k)}</li>
+ *       <li>TableSize --> {@link DataOutput#writeVInt vInt}</li>
+ *       <li>MinValue --> {@link DataOutput#writeLong Int64}</li>
+ *       <li>GCD --> {@link DataOutput#writeLong Int64}</li>
+ *     </ul>
+ *     <p>SortedSet entries store the list of ordinals in their BinaryData as a
+ *        sequence of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded
+ *        (see the sketch after this section).</p>
+ *   </li>
+ * </ol>
+ *
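To make the last point concrete, this is how an increasing ord list like the one the (nocommit-flagged) javadoc above describes can be delta-encoded as vLongs. The codec writes into its own .dvd stream; ByteArrayDataOutput is used here purely to keep the sketch self-contained:

  import java.io.IOException;
  import org.apache.lucene.store.ByteArrayDataOutput;

  // Delta-encode an increasing ord list as vLongs (illustrative only).
  static void writeOrds(ByteArrayDataOutput out, long[] increasingOrds) throws IOException {
    long previous = 0;
    for (long ord : increasingOrds) {
      // deltas are non-negative because the ords are increasing, so they
      // stay valid vLongs and small gaps encode in very few bytes
      out.writeVLong(ord - previous);
      previous = ord;
    }
  }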
    + * @lucene.experimental + */ +// nocommit: docs are incomplete +public final class Lucene45DocValuesFormat extends DocValuesFormat { + + public Lucene45DocValuesFormat() { + super("Lucene45"); + } + + @Override + public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new Lucene45DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + } + + @Override + public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { + return new Lucene45DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); + } + + public static final String DATA_CODEC = "Lucene45DocValuesData"; + public static final String DATA_EXTENSION = "dvd"; + public static final String META_CODEC = "Lucene45ValuesMetadata"; + public static final String META_EXTENSION = "dvm"; + public static final int VERSION_START = 0; + public static final int VERSION_CURRENT = VERSION_START; + public static final byte NUMERIC = 0; + public static final byte BINARY = 1; + public static final byte SORTED = 2; + public static final byte SORTED_SET = 3; +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java new file mode 100644 index 00000000000..b19a34e169c --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java @@ -0,0 +1,755 @@ +package org.apache.lucene.codecs.lucene45; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.DELTA_COMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.GCD_COMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.TABLE_COMPRESSED; + +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_FIXED_UNCOMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED; +import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.BINARY_PREFIX_COMPRESSED; + +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.packed.BlockPackedReader; +import org.apache.lucene.util.packed.MonotonicBlockPackedReader; +import org.apache.lucene.util.packed.PackedInts; + +class Lucene45DocValuesProducer extends DocValuesProducer { + private final Map numerics; + private final Map binaries; + private final Map ords; + private final Map ordIndexes; + private final IndexInput data; + private final int maxDoc; + + // memory-resident structures + private final Map addressInstances = new HashMap(); + private final Map ordIndexInstances = new HashMap(); + + Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + // read in the entries from the metadata file. 
+ IndexInput in = state.directory.openInput(metaName, state.context); + this.maxDoc = state.segmentInfo.getDocCount(); + boolean success = false; + final int version; + try { + version = CodecUtil.checkHeader(in, metaCodec, + Lucene45DocValuesFormat.VERSION_CURRENT, + Lucene45DocValuesFormat.VERSION_CURRENT); + numerics = new HashMap(); + ords = new HashMap(); + ordIndexes = new HashMap(); + binaries = new HashMap(); + readFields(in, state.fieldInfos); + + success = true; + } finally { + if (success) { + IOUtils.close(in); + } else { + IOUtils.closeWhileHandlingException(in); + } + } + + success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.openInput(dataName, state.context); + final int version2 = CodecUtil.checkHeader(data, dataCodec, + Lucene45DocValuesFormat.VERSION_CURRENT, + Lucene45DocValuesFormat.VERSION_CURRENT); + if (version != version2) { + throw new CorruptIndexException("Format versions mismatch"); + } + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this.data); + } + } + } + + private void readFields(IndexInput meta, FieldInfos infos) throws IOException { + int fieldNumber = meta.readVInt(); + while (fieldNumber != -1) { + byte type = meta.readByte(); + if (type == Lucene45DocValuesFormat.NUMERIC) { + numerics.put(fieldNumber, readNumericEntry(meta)); + } else if (type == Lucene45DocValuesFormat.BINARY) { + BinaryEntry b = readBinaryEntry(meta); + binaries.put(fieldNumber, b); + } else if (type == Lucene45DocValuesFormat.SORTED) { + // sorted = binary + numeric + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + if (meta.readByte() != Lucene45DocValuesFormat.BINARY) { + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + BinaryEntry b = readBinaryEntry(meta); + binaries.put(fieldNumber, b); + + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + NumericEntry n = readNumericEntry(meta); + ords.put(fieldNumber, n); + } else if (type == Lucene45DocValuesFormat.SORTED_SET) { + // sortedset = binary + numeric + ordIndex + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + if (meta.readByte() != Lucene45DocValuesFormat.BINARY) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + BinaryEntry b = readBinaryEntry(meta); + binaries.put(fieldNumber, b); + + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + NumericEntry n1 = readNumericEntry(meta); + ords.put(fieldNumber, n1); + + if (meta.readVInt() != fieldNumber) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is 
corrupt (resource=" + meta + ")"); + } + if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { + throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); + } + NumericEntry n2 = readNumericEntry(meta); + ordIndexes.put(fieldNumber, n2); + } else { + throw new CorruptIndexException("invalid type: " + type + ", resource=" + meta); + } + fieldNumber = meta.readVInt(); + } + } + + static NumericEntry readNumericEntry(IndexInput meta) throws IOException { + NumericEntry entry = new NumericEntry(); + entry.format = meta.readVInt(); + entry.packedIntsVersion = meta.readVInt(); + entry.offset = meta.readLong(); + entry.count = meta.readVLong(); + entry.blockSize = meta.readVInt(); + switch(entry.format) { + case GCD_COMPRESSED: + entry.minValue = meta.readLong(); + entry.gcd = meta.readLong(); + break; + case TABLE_COMPRESSED: + if (entry.count > Integer.MAX_VALUE) { + throw new CorruptIndexException("Cannot use TABLE_COMPRESSED with more than MAX_VALUE values, input=" + meta); + } + final int uniqueValues = meta.readVInt(); + if (uniqueValues > 256) { + throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + meta); + } + entry.table = new long[uniqueValues]; + for (int i = 0; i < uniqueValues; ++i) { + entry.table[i] = meta.readLong(); + } + break; + case DELTA_COMPRESSED: + break; + default: + throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); + } + return entry; + } + + static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException { + BinaryEntry entry = new BinaryEntry(); + entry.format = meta.readVInt(); + entry.minLength = meta.readVInt(); + entry.maxLength = meta.readVInt(); + entry.count = meta.readVLong(); + entry.offset = meta.readLong(); + switch(entry.format) { + case BINARY_FIXED_UNCOMPRESSED: + break; + case BINARY_PREFIX_COMPRESSED: + entry.addressInterval = meta.readVInt(); + entry.addressesOffset = meta.readLong(); + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); + break; + case BINARY_VARIABLE_UNCOMPRESSED: + entry.addressesOffset = meta.readLong(); + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); + break; + default: + throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); + } + return entry; + } + + @Override + public NumericDocValues getNumeric(FieldInfo field) throws IOException { + NumericEntry entry = numerics.get(field.number); + return getNumeric(entry); + } + + LongNumericDocValues getNumeric(NumericEntry entry) throws IOException { + final IndexInput data = this.data.clone(); + data.seek(entry.offset); + + switch (entry.format) { + case DELTA_COMPRESSED: + final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); + return new LongNumericDocValues() { + @Override + public long get(long id) { + return reader.get(id); + } + }; + case GCD_COMPRESSED: + final long min = entry.minValue; + final long mult = entry.gcd; + final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); + return new LongNumericDocValues() { + @Override + public long get(long id) { + return min + mult * quotientReader.get(id); + } + }; + case TABLE_COMPRESSED: + final long table[] = entry.table; + final int bitsRequired = PackedInts.bitsRequired(table.length - 1); + final PackedInts.Reader ords = 
PackedInts.getDirectReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, (int) entry.count, bitsRequired); + return new LongNumericDocValues() { + @Override + public long get(long id) { + return table[(int) ords.get((int) id)]; + } + }; + default: + throw new AssertionError(); + } + } + + @Override + public BinaryDocValues getBinary(FieldInfo field) throws IOException { + BinaryEntry bytes = binaries.get(field.number); + switch(bytes.format) { + case BINARY_FIXED_UNCOMPRESSED: + return getFixedBinary(field, bytes); + case BINARY_VARIABLE_UNCOMPRESSED: + return getVariableBinary(field, bytes); + case BINARY_PREFIX_COMPRESSED: + return getCompressedBinary(field, bytes); + default: + throw new AssertionError(); + } + } + + private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) { + final IndexInput data = this.data.clone(); + + return new LongBinaryDocValues() { + @Override + public void get(long id, BytesRef result) { + long address = bytes.offset + id * bytes.maxLength; + try { + data.seek(address); + // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) + // assume "they" own the bytes after calling this! + final byte[] buffer = new byte[bytes.maxLength]; + data.readBytes(buffer, 0, buffer.length); + result.bytes = buffer; + result.offset = 0; + result.length = buffer.length; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final IndexInput data = this.data.clone(); + + final MonotonicBlockPackedReader addresses; + synchronized (addressInstances) { + MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number); + if (addrInstance == null) { + data.seek(bytes.addressesOffset); + addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, false); + addressInstances.put(field.number, addrInstance); + } + addresses = addrInstance; + } + + return new LongBinaryDocValues() { + @Override + public void get(long id, BytesRef result) { + long startAddress = bytes.offset + (id == 0 ? 0 : addresses.get(id-1)); + long endAddress = bytes.offset + addresses.get(id); + int length = (int) (endAddress - startAddress); + try { + data.seek(startAddress); + // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) + // assume "they" own the bytes after calling this! 
+ final byte[] buffer = new byte[length]; + data.readBytes(buffer, 0, buffer.length); + result.bytes = buffer; + result.offset = 0; + result.length = length; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }; + } + + private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final IndexInput data = this.data.clone(); + final long interval = bytes.addressInterval; + + final MonotonicBlockPackedReader addresses; + synchronized (addressInstances) { + MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number); + if (addrInstance == null) { + data.seek(bytes.addressesOffset); + final long size; + if (bytes.count % interval == 0) { + size = bytes.count / interval; + } else { + size = 1L + bytes.count / interval; + } + addrInstance = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, size, false); + addressInstances.put(field.number, addrInstance); + } + addresses = addrInstance; + } + + return new CompressedBinaryDocValues(bytes, addresses, data); + } + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + final int valueCount = (int) binaries.get(field.number).count; + final BinaryDocValues binary = getBinary(field); + NumericEntry entry = ords.get(field.number); + IndexInput data = this.data.clone(); + data.seek(entry.offset); + final BlockPackedReader ordinals = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); + + return new SortedDocValues() { + + @Override + public int getOrd(int docID) { + return (int) ordinals.get(docID); + } + + @Override + public void lookupOrd(int ord, BytesRef result) { + binary.get(ord, result); + } + + @Override + public int getValueCount() { + return valueCount; + } + + @Override + public int lookupTerm(BytesRef key) { + if (binary instanceof CompressedBinaryDocValues) { + return (int) ((CompressedBinaryDocValues)binary).lookupTerm(key); + } else { + return super.lookupTerm(key); + } + } + + @Override + public TermsEnum termsEnum() { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues)binary).getTermsEnum(); + } else { + return super.termsEnum(); + } + } + }; + } + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + final long valueCount = binaries.get(field.number).count; + // we keep the byte[]s and list of ords on disk, these could be large + final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); + final LongNumericDocValues ordinals = getNumeric(ords.get(field.number)); + // but the addresses to the ord stream are in RAM + final MonotonicBlockPackedReader ordIndex; + synchronized (ordIndexInstances) { + MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number); + if (ordIndexInstance == null) { + NumericEntry entry = ordIndexes.get(field.number); + IndexInput data = this.data.clone(); + data.seek(entry.offset); + ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false); + ordIndexInstances.put(field.number, ordIndexInstance); + } + ordIndex = ordIndexInstance; + } + + return new SortedSetDocValues() { + long offset; + long endOffset; + + @Override + public long nextOrd() { + if (offset == endOffset) { + return NO_MORE_ORDS; + } else { + long ord = ordinals.get(offset); + offset++; + return ord; + } + } + + @Override + public void setDocument(int docID) { + offset = (docID == 0 ? 
0 : ordIndex.get(docID-1)); + endOffset = ordIndex.get(docID); + } + + @Override + public void lookupOrd(long ord, BytesRef result) { + binary.get(ord, result); + } + + @Override + public long getValueCount() { + return valueCount; + } + + @Override + public long lookupTerm(BytesRef key) { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues)binary).lookupTerm(key); + } else { + return super.lookupTerm(key); + } + } + + @Override + public TermsEnum termsEnum() { + if (binary instanceof CompressedBinaryDocValues) { + return ((CompressedBinaryDocValues)binary).getTermsEnum(); + } else { + return super.termsEnum(); + } + } + }; + } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + // nocommit: only use this if the field's entry has missing values (write that), + // otherwise return MatchAllBits + if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) { + return new SortedSetDocsWithField(getSortedSet(field), maxDoc); + } else { + return new Bits.MatchAllBits(maxDoc); + } + } + + @Override + public void close() throws IOException { + data.close(); + } + + static class NumericEntry { + long offset; + + int format; + int packedIntsVersion; + long count; + int blockSize; + + long minValue; + long gcd; + long table[]; + } + + static class BinaryEntry { + long offset; + + int format; + long count; + int minLength; + int maxLength; + long addressesOffset; + long addressInterval; + int packedIntsVersion; + int blockSize; + } + + // internally we compose complex dv (sorted/sortedset) from other ones + static abstract class LongNumericDocValues extends NumericDocValues { + @Override + public final long get(int docID) { + return get((long) docID); + } + + abstract long get(long id); + } + + static abstract class LongBinaryDocValues extends BinaryDocValues { + @Override + public final void get(int docID, BytesRef result) { + get((long)docID, result); + } + + abstract void get(long id, BytesRef Result); + } + + // in the compressed case, we add a few additional operations for + // more efficient reverse lookup and enumeration + static class CompressedBinaryDocValues extends LongBinaryDocValues { + final BinaryEntry bytes; + final long interval; + final long numValues; + final long numIndexValues; + final MonotonicBlockPackedReader addresses; + final IndexInput data; + final TermsEnum termsEnum; + + public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, IndexInput data) throws IOException { + this.bytes = bytes; + this.interval = bytes.addressInterval; + this.addresses = addresses; + this.data = data; + this.numValues = bytes.count; + this.numIndexValues = addresses.size(); + this.termsEnum = getTermsEnum(data); + } + + @Override + public void get(long id, BytesRef result) { + try { + termsEnum.seekExact(id); + BytesRef term = termsEnum.term(); + result.bytes = term.bytes; + result.offset = term.offset; + result.length = term.length; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + long lookupTerm(BytesRef key) { + try { + SeekStatus status = termsEnum.seekCeil(key); + if (status == SeekStatus.END) { + return -numValues-1; + } else if (status == SeekStatus.FOUND) { + return termsEnum.ord(); + } else { + return -termsEnum.ord()-1; + } + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + TermsEnum getTermsEnum() { + try { + return getTermsEnum(data.clone()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private 
TermsEnum getTermsEnum(final IndexInput input) throws IOException { + input.seek(bytes.offset); + + return new TermsEnum() { + private long currentOrd = -1; + // TODO: maxLength is negative when all terms are merged away... + private final BytesRef termBuffer = new BytesRef(bytes.maxLength < 0 ? 0 : bytes.maxLength); + private final BytesRef term = new BytesRef(); // TODO: paranoia? + + @Override + public BytesRef next() throws IOException { + if (doNext() == null) { + return null; + } else { + setTerm(); + return term; + } + } + + private BytesRef doNext() throws IOException { + if (++currentOrd >= numValues) { + return null; + } else { + int start = input.readVInt(); + int suffix = input.readVInt(); + input.readBytes(termBuffer.bytes, start, suffix); + termBuffer.length = start + suffix; + return termBuffer; + } + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + // binary-search just the index values to find the block, + // then scan within the block + long low = 0; + long high = numIndexValues-1; + + while (low <= high) { + long mid = (low + high) >>> 1; + doSeek(mid * interval); + int cmp = termBuffer.compareTo(text); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + // we got lucky, found an indexed term + setTerm(); + return SeekStatus.FOUND; + } + } + + if (numIndexValues == 0) { + return SeekStatus.END; + } + + // block before insertion point + long block = low-1; + doSeek(block < 0 ? -1 : block * interval); + + while (doNext() != null) { + int cmp = termBuffer.compareTo(text); + if (cmp == 0) { + setTerm(); + return SeekStatus.FOUND; + } else if (cmp > 0) { + setTerm(); + return SeekStatus.NOT_FOUND; + } + } + + return SeekStatus.END; + } + + @Override + public void seekExact(long ord) throws IOException { + doSeek(ord); + setTerm(); + } + + private void doSeek(long ord) throws IOException { + long block = ord / interval; + + if (ord >= currentOrd && block == currentOrd / interval) { + // seek within current block + } else { + // position before start of block + currentOrd = ord - ord % interval - 1; + input.seek(bytes.offset + addresses.get(block)); + } + + while (currentOrd < ord) { + doNext(); + } + } + + private void setTerm() { + // TODO: is there a cleaner way + term.bytes = new byte[termBuffer.length]; + term.offset = 0; + term.copyBytes(termBuffer); + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + public long ord() throws IOException { + return currentOrd; + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() throws IOException { + return -1; + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + }; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/package.html b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/package.html new file mode 100644 index 00000000000..677c176a89b --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/package.html @@ -0,0 +1,396 @@ + + + + + + + +Lucene 4.5 file format. + +

    Apache Lucene - Index File Formats

Introduction

This document defines the index file formats used in this version of Lucene. If you are using a different version of Lucene, please consult the copy of docs/ that was distributed with the version you are using.

Apache Lucene is written in Java, but several efforts are underway to write versions of Lucene in other programming languages. If these versions are to remain compatible with Apache Lucene, then a language-independent definition of the Lucene index format is required. This document thus attempts to provide a complete and independent definition of the Apache Lucene file formats.

As Lucene evolves, this document should evolve. Versions of Lucene in different programming languages should endeavor to agree on file formats, and generate new versions of this document.

    Definitions

The fundamental concepts in Lucene are index, document, field and term.

An index contains a sequence of documents.

• A document is a sequence of fields.
• A field is a named sequence of terms.
• A term is a sequence of bytes.

The same sequence of bytes in two different fields is considered a different term. Thus terms are represented as a pair: the string naming the field, and the bytes within the field.
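To make the field-plus-bytes pairing concrete, here is a small illustrative sketch (not part of the file format itself) using the Term class:

import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;

public class TermPairExample {
  public static void main(String[] args) {
    // the same byte sequence under two different field names is two distinct terms
    Term a = new Term("title", new BytesRef("lucene"));
    Term b = new Term("body", new BytesRef("lucene"));
    System.out.println(a.equals(b)); // false: the field name is part of the term
  }
}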

    Inverted Indexing

The index stores statistics about terms in order to make term-based search more efficient. Lucene's index falls into the family of indexes known as an inverted index. This is because it can list, for a term, the documents that contain it. This is the inverse of the natural relationship, in which documents list terms.
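To make the inverted relationship concrete, the following is a minimal, self-contained sketch of an index that lists, for each term, the documents containing it. It is an illustration of the concept only, not Lucene's actual representation:

import java.util.*;

public class ToyInvertedIndex {
  // maps each term to the sorted set of document numbers that contain it
  private final Map<String, SortedSet<Integer>> postings = new HashMap<>();

  public void addDocument(int docNumber, List<String> terms) {
    for (String term : terms) {
      postings.computeIfAbsent(term, t -> new TreeSet<>()).add(docNumber);
    }
  }

  // the inverted lookup: for a term, which documents contain it?
  public SortedSet<Integer> documentsContaining(String term) {
    return postings.getOrDefault(term, Collections.emptySortedSet());
  }

  public static void main(String[] args) {
    ToyInvertedIndex index = new ToyInvertedIndex();
    index.addDocument(0, Arrays.asList("lucene", "index", "format"));
    index.addDocument(1, Arrays.asList("lucene", "search"));
    System.out.println(index.documentsContaining("lucene")); // [0, 1]
  }
}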

    Types of Fields

In Lucene, fields may be stored, in which case their text is stored in the index literally, in a non-inverted manner. Fields that are inverted are called indexed. A field may be both stored and indexed.

The text of a field may be tokenized into terms to be indexed, or the text of a field may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is useful for certain identifier fields to be indexed literally.

See the {@link org.apache.lucene.document.Field Field} java docs for more information on Fields.
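As an illustration of these combinations, the sketch below uses the sugar field types from the document package; the field names and values are invented for the example:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

public class FieldTypesExample {
  static Document exampleDocument() {
    Document doc = new Document();
    // tokenized and inverted (indexed), but not stored
    doc.add(new TextField("body", "some text that will be tokenized", Store.NO));
    // indexed literally as a single term, and also stored
    doc.add(new StringField("id", "doc-42", Store.YES));
    // stored only: returned with hits, but not searchable
    doc.add(new StoredField("price", 9.99));
    return doc;
  }
}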

    Segments

Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a fully independent index, which could be searched separately. Indexes evolve by:

1. Creating new segments for newly added documents.
2. Merging existing segments.

Searches may involve multiple segments and/or multiple indexes, each index potentially composed of a set of segments.

    Document Numbers

Internally, Lucene refers to documents by an integer document number. The first document added to an index is numbered zero, and each subsequent document added gets a number one greater than the previous.

Note that a document's number may change, so caution should be taken when storing these numbers outside of Lucene. In particular, numbers may change in the following situations:

• The numbers stored in each segment are unique only within the segment, and must be converted before they can be used in a larger context. The standard technique is to allocate each segment a range of values, based on the range of numbers used in that segment. To convert a document number from a segment to an external value, the segment's base document number is added. To convert an external value back to a segment-specific value, the segment is identified by the range that the external value is in, and the segment's base value is subtracted. For example, two five-document segments might be combined, so that the first segment has a base value of zero, and the second of five. Document three from the second segment would have an external value of eight. (A minimal sketch of this conversion follows the list.)

• When documents are deleted, gaps are created in the numbering. These are eventually removed as the index evolves through merging. Deleted documents are dropped when segments are merged. A freshly-merged segment thus has no gaps in its numbering.
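Because the remapping rule is easy to get wrong, here is a minimal sketch of the conversion described in the first item above. The fixed two-segment layout is purely illustrative:

public class DocNumberConversion {
  // base document number of each segment; here, two five-document segments
  static final int[] SEGMENT_BASES = {0, 5};

  static int toExternal(int segment, int localDoc) {
    return SEGMENT_BASES[segment] + localDoc; // add the segment's base
  }

  static int toLocal(int externalDoc) {
    // identify the segment by the range the external value falls in,
    // then subtract that segment's base
    for (int i = SEGMENT_BASES.length - 1; i >= 0; i--) {
      if (externalDoc >= SEGMENT_BASES[i]) {
        return externalDoc - SEGMENT_BASES[i];
      }
    }
    throw new IllegalArgumentException("negative doc number: " + externalDoc);
  }

  public static void main(String[] args) {
    System.out.println(toExternal(1, 3)); // 8: document three of the second segment
    System.out.println(toLocal(8));       // 3, within the segment whose base is 5
  }
}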

    Index Structure Overview


    Each segment index maintains the following:

• {@link org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat Segment info}. This contains metadata about a segment, such as the number of documents and what files it uses.
• {@link org.apache.lucene.codecs.lucene42.Lucene42FieldInfosFormat Field names}. This contains the set of field names used in the index.
• {@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Stored Field values}. This contains, for each document, a list of attribute-value pairs, where the attributes are field names. These are used to store auxiliary information about the document, such as its title, url, or an identifier to access a database. The set of stored fields is what is returned for each hit when searching. This is keyed by document number.
• {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term dictionary}. A dictionary containing all of the terms used in all of the indexed fields of all of the documents. The dictionary also contains the number of documents which contain the term, and pointers to the term's frequency and proximity data.
• {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Frequency data}. For each term in the dictionary, the numbers of all the documents that contain that term, and the frequency of the term in that document, unless frequencies are omitted (IndexOptions.DOCS_ONLY).
• {@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Proximity data}. For each term in the dictionary, the positions that the term occurs in each document. Note that this will not exist if all fields in all documents omit position data.
• {@link org.apache.lucene.codecs.lucene42.Lucene42NormsFormat Normalization factors}. For each field in each document, a value is stored that is multiplied into the score for hits on that field.
• {@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vectors}. For each field in each document, the term vector (sometimes called document vector) may be stored. A term vector consists of term text and term frequency. To add Term Vectors to your index see the {@link org.apache.lucene.document.Field Field} constructors.
• {@link org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat Per-document values}. Like stored values, these are also keyed by document number, but are generally intended to be loaded into main memory for fast access. Whereas stored values are generally intended for summary results from searches, per-document values are useful for things like scoring factors.
• {@link org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat Deleted documents}. An optional file indicating which documents are deleted.

    Details on each of these are provided in their linked pages.


    File Naming

All files belonging to a segment have the same name with varying extensions. The extensions correspond to the different file formats described below. When using the Compound File format (default in 1.4 and greater) these files (except for the Segment info file, the Lock file, and Deleted documents file) are collapsed into a single .cfs file (see below for details).

Typically, all segments in an index are stored in a single directory, although this is not required.

As of version 2.1 (lock-less commits), file names are never re-used (there is one exception, "segments.gen", see below). That is, when any file is saved to the Directory it is given a never before used filename. This is achieved using a simple generations approach. For example, the first segments file is segments_1, then segments_2, etc. The generation is a sequential long integer represented in alpha-numeric (base 36) form.
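For illustration, generation-stamped names like these can be derived with plain Java; the helper below is a sketch, not Lucene's internal API:

public class SegmentsFileNames {
  static String segmentsFileName(long generation) {
    // Character.MAX_RADIX is 36, so generations render as 1, 2, ..., 9, a, b, ..., z, 10, ...
    return "segments_" + Long.toString(generation, Character.MAX_RADIX);
  }

  public static void main(String[] args) {
    System.out.println(segmentsFileName(1));  // segments_1
    System.out.println(segmentsFileName(10)); // segments_a
    System.out.println(segmentsFileName(36)); // segments_10
  }
}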

    Summary of File Extensions

The following table summarizes the names and extensions of the files in Lucene:

Name | Extension | Brief Description
{@link org.apache.lucene.index.SegmentInfos Segments File} | segments.gen, segments_N | Stores information about a commit point
Lock File | write.lock | The write lock prevents multiple IndexWriters from writing to the same file.
{@link org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat Segment Info} | .si | Stores metadata about a segment
{@link org.apache.lucene.store.CompoundFileDirectory Compound File} | .cfs, .cfe | An optional "virtual" file consisting of all the other index files for systems that frequently run out of file handles.
{@link org.apache.lucene.codecs.lucene42.Lucene42FieldInfosFormat Fields} | .fnm | Stores information about the fields
{@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Field Index} | .fdx | Contains pointers to field data
{@link org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat Field Data} | .fdt | The stored fields for documents
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Dictionary} | .tim | The term dictionary, stores term info
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Term Index} | .tip | The index into the Term Dictionary
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Frequencies} | .doc | Contains the list of docs which contain each term along with frequency
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Positions} | .pos | Stores position information about where a term occurs in the index
{@link org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat Payloads} | .pay | Stores additional per-position metadata information such as character offsets and user payloads
{@link org.apache.lucene.codecs.lucene42.Lucene42NormsFormat Norms} | .nvd, .nvm | Encodes length and boost factors for docs and fields
{@link org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat Per-Document Values} | .dvd, .dvm | Encodes additional scoring factors or other per-document information
{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Index} | .tvx | Stores offset into the document data file
{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Documents} | .tvd | Contains information about each document that has term vectors
{@link org.apache.lucene.codecs.lucene42.Lucene42TermVectorsFormat Term Vector Fields} | .tvf | The field level info about term vectors
{@link org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat Deleted Documents} | .del | Info about what documents are deleted

    Lock File

The write lock, which is stored in the index directory by default, is named "write.lock". If the lock directory is different from the index directory then the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index directory. When this file is present, a writer is currently modifying the index (adding or removing documents). This lock file ensures that only one writer is modifying the index at a time.

    History

Compatibility notes are provided in this document, describing how file formats have changed from prior versions:

• In version 2.1, the file format was changed to allow lock-less commits (i.e., no more commit lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching or adding/deleting of docs. When the new segments file is saved (committed), it will be written in the new file format (meaning no specific "upgrade" process is needed). But note that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.
• In version 2.3, the file format was changed to allow segments to share a single set of doc store (vectors & stored fields) files. This allows for faster indexing in certain cases. The change is fully backwards compatible (in the same way as the lock-less commits change in 2.1).
• In version 2.4, Strings are now written as a true UTF-8 byte sequence, not Java's modified UTF-8. See LUCENE-510 for details.
• In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N file. See LUCENE-1382 for details. Also, diagnostics were added to each segment written, recording details about why it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details.
• In version 3.0, compressed fields are no longer written to the index (they can still be read, but on merge the new segment will write them uncompressed). See issue LUCENE-1960 for details.
• In version 3.1, segments record the code version that created them. See LUCENE-2720 for details. Additionally, segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details.
• In version 3.2, numeric fields are written natively to the stored fields file; previously they were stored in text format only.
• In version 3.4, fields can omit position data while still indexing term frequencies.
• In version 4.0, the format of the inverted index became extensible via the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues}) was introduced. Normalization factors need no longer be a single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into the postings lists. Payloads can be stored in the term vectors.
• In version 4.1, the format of the postings list changed to use either FOR compression or variable-byte encoding, depending upon the frequency of the term. Terms appearing only once were changed to inline directly into the term dictionary. Stored fields are compressed by default.
• In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields.
• In version 4.5, DocValues were extended to explicitly represent missing values (see the sketch after this list).
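As a sketch of what the 4.5 change enables, the snippet below distinguishes a document that has no value for a field from one whose value happens to be zero, using the getDocsWithField API this patch introduces; the field name "popularity" is invented:

import java.io.IOException;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.util.Bits;

public class MissingValuesExample {
  static Long valueOrNull(AtomicReader reader, int docID) throws IOException {
    NumericDocValues values = reader.getNumericDocValues("popularity");
    Bits docsWithField = reader.getDocsWithField("popularity");
    if (values == null || !docsWithField.get(docID)) {
      return null; // the document genuinely has no value for this field
    }
    return values.get(docID); // may legitimately be 0
  }
}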

    Limitations

Lucene uses a Java int to refer to document numbers, and the index file format uses an Int32 on-disk to store document numbers. This is a limitation of both the index file format and the current implementation. Eventually these should be replaced with either UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.
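Since VInt is central to that suggestion, here is a sketch of the variable-length scheme (seven data bits per byte, with the high bit marking continuation), written only from the description above:

import java.io.ByteArrayOutputStream;

public class VIntSketch {
  static byte[] encode(int value) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    while ((value & ~0x7F) != 0) {      // more than seven significant bits remain
      out.write((value & 0x7F) | 0x80); // emit the low seven bits, set the continuation bit
      value >>>= 7;
    }
    out.write(value);                   // final byte has its high bit clear
    return out.toByteArray();
  }

  public static void main(String[] args) {
    System.out.println(encode(127).length); // 1 byte
    System.out.println(encode(128).length); // 2 bytes
  }
}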
    + + diff --git a/lucene/core/src/java/org/apache/lucene/codecs/package.html b/lucene/core/src/java/org/apache/lucene/codecs/package.html index f0f12b42b75..16f53b5cdde 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/package.html +++ b/lucene/core/src/java/org/apache/lucene/codecs/package.html @@ -61,9 +61,13 @@ name of your codec. If you just want to customise the {@link org.apache.lucene.codecs.PostingsFormat}, or use different postings formats for different fields, then you can register your custom postings format in the same way (in META-INF/services/org.apache.lucene.codecs.PostingsFormat), and then extend the default - {@link org.apache.lucene.codecs.lucene42.Lucene42Codec} and override - {@link org.apache.lucene.codecs.lucene42.Lucene42Codec#getPostingsFormatForField(String)} to return your custom + {@link org.apache.lucene.codecs.lucene45.Lucene45Codec} and override + {@link org.apache.lucene.codecs.lucene45.Lucene45Codec#getPostingsFormatForField(String)} to return your custom postings format.

Similarly, if you just want to customise the {@link org.apache.lucene.codecs.DocValuesFormat} per-field, have a look at {@link org.apache.lucene.codecs.lucene45.Lucene45Codec#getDocValuesFormatForField(String)}.
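To show the kind of override that paragraph describes, here is a sketch modeled on the tests in this patch; the field name "popularity" and the SimpleText choice are illustrative only:

import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.lucene45.Lucene45Codec;

public class PerFieldDocValuesCodec extends Lucene45Codec {
  private final DocValuesFormat fast = DocValuesFormat.forName("Lucene45");
  private final DocValuesFormat debug = DocValuesFormat.forName("SimpleText");

  @Override
  public DocValuesFormat getDocValuesFormatForField(String field) {
    // route one field to the human-readable format, everything else to the default
    return "popularity".equals(field) ? debug : fast;
  }
}

The codec would then be installed via IndexWriterConfig#setCodec, as the tests in this patch do.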

    diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 5a702d29e39..d0f7bed509e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -1410,7 +1410,7 @@ public class CheckIndex { private static void checkNumericDocValues(String fieldName, AtomicReader reader, NumericDocValues ndv, Bits docsWithField) { for (int i = 0; i < reader.maxDoc(); i++) { long value = ndv.get(i); - if (docsWithField.get(i) == false && value > 0) { + if (docsWithField.get(i) == false && value != 0) { throw new RuntimeException("dv for field: " + fieldName + " is marked missing but has value=" + value + " for doc: " + i); } } diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 8ae12c55368..3398b343d49 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -16,3 +16,4 @@ org.apache.lucene.codecs.lucene40.Lucene40Codec org.apache.lucene.codecs.lucene41.Lucene41Codec org.apache.lucene.codecs.lucene42.Lucene42Codec +org.apache.lucene.codecs.lucene45.Lucene45Codec \ No newline at end of file diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat index c0e599e3611..262f8a2bdd5 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat @@ -14,3 +14,4 @@ # limitations under the License. org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat +org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat \ No newline at end of file diff --git a/lucene/core/src/test/org/apache/lucene/TestExternalCodecs.java b/lucene/core/src/test/org/apache/lucene/TestExternalCodecs.java index d973fb5c67c..df66d801833 100644 --- a/lucene/core/src/test/org/apache/lucene/TestExternalCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/TestExternalCodecs.java @@ -17,21 +17,27 @@ package org.apache.lucene; * limitations under the License. 
*/ -import org.apache.lucene.analysis.*; -import org.apache.lucene.codecs.*; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; -import org.apache.lucene.document.*; -import org.apache.lucene.index.*; -import org.apache.lucene.search.*; -import org.apache.lucene.store.*; -import org.apache.lucene.util.*; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.BaseDirectoryWrapper; +import org.apache.lucene.util.LuceneTestCase; + /* Intentionally outside of oal.index to verify fully external codecs work fine */ public class TestExternalCodecs extends LuceneTestCase { - private static final class CustomPerFieldCodec extends Lucene42Codec { + private static final class CustomPerFieldCodec extends Lucene45Codec { private final PostingsFormat ramFormat = PostingsFormat.forName("RAMOnly"); private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41"); diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestCheapBastardDocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene45/TestLucene45DocValuesFormat.java similarity index 79% rename from lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestCheapBastardDocValuesFormat.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene45/TestLucene45DocValuesFormat.java index 4cd40c5cd4e..3f6171acd62 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/diskdv/TestCheapBastardDocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene45/TestLucene45DocValuesFormat.java @@ -1,4 +1,4 @@ -package org.apache.lucene.codecs.diskdv; +package org.apache.lucene.codecs.lucene45; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -18,15 +18,14 @@ package org.apache.lucene.codecs.diskdv; */ import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.cheapbastard.CheapBastardDocValuesFormat; import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase; import org.apache.lucene.util._TestUtil; /** - * Tests CheapBastardDocValuesFormat + * Tests Lucene45DocValuesFormat */ -public class TestCheapBastardDocValuesFormat extends BaseCompressingDocValuesFormatTestCase { - private final Codec codec = _TestUtil.alwaysDocValuesFormat(new CheapBastardDocValuesFormat()); +public class TestLucene45DocValuesFormat extends BaseCompressingDocValuesFormatTestCase { + private final Codec codec = _TestUtil.alwaysDocValuesFormat(new Lucene45DocValuesFormat()); @Override protected Codec getCodec() { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java index 06795f005c6..43ac931c16f 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java @@ -25,7 +25,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import 
org.apache.lucene.codecs.DocValuesFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -79,9 +79,9 @@ public class TestPerFieldDocValuesFormat extends BaseDocValuesFormatTestCase { Directory directory = newDirectory(); // we don't use RandomIndexWriter because it might add more docvalues than we expect !!!!1 IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); - final DocValuesFormat fast = DocValuesFormat.forName("Lucene42"); + final DocValuesFormat fast = DocValuesFormat.forName("Lucene45"); final DocValuesFormat slow = DocValuesFormat.forName("SimpleText"); - iwc.setCodec(new Lucene42Codec() { + iwc.setCodec(new Lucene45Codec() { @Override public DocValuesFormat getDocValuesFormatForField(String field) { if ("dv1".equals(field)) { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java index 8d9f8838fba..4bfc8115ef6 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat2.java @@ -21,7 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; import org.apache.lucene.codecs.mocksep.MockSepPostingsFormat; import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat; @@ -200,7 +200,7 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase { } - public static class MockCodec extends Lucene42Codec { + public static class MockCodec extends Lucene45Codec { final PostingsFormat lucene40 = new Lucene41PostingsFormat(); final PostingsFormat simpleText = new SimpleTextPostingsFormat(); final PostingsFormat mockSep = new MockSepPostingsFormat(); @@ -217,7 +217,7 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase { } } - public static class MockCodec2 extends Lucene42Codec { + public static class MockCodec2 extends Lucene45Codec { final PostingsFormat lucene40 = new Lucene41PostingsFormat(); final PostingsFormat simpleText = new SimpleTextPostingsFormat(); @@ -268,7 +268,7 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase { } public void testSameCodecDifferentInstance() throws Exception { - Codec codec = new Lucene42Codec() { + Codec codec = new Lucene45Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { if ("id".equals(field)) { @@ -284,7 +284,7 @@ public class TestPerFieldPostingsFormat2 extends LuceneTestCase { } public void testSameCodecDifferentParams() throws Exception { - Codec codec = new Lucene42Codec() { + Codec codec = new Lucene45Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { if ("id".equals(field)) { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java index d0789dca5e3..d0ea942b959 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java +++ 
b/lucene/core/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -28,7 +28,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -1060,7 +1060,7 @@ public class TestAddIndexes extends LuceneTestCase { aux2.close(); } - private static final class CustomPerFieldCodec extends Lucene42Codec { + private static final class CustomPerFieldCodec extends Lucene45Codec { private final PostingsFormat simpleTextFormat = PostingsFormat.forName("SimpleText"); private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41"); private final PostingsFormat mockSepFormat = PostingsFormat.forName("MockSep"); @@ -1111,7 +1111,7 @@ public class TestAddIndexes extends LuceneTestCase { private static final class UnRegisteredCodec extends FilterCodec { public UnRegisteredCodec() { - super("NotRegistered", new Lucene42Codec()); + super("NotRegistered", new Lucene45Codec()); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java index 9a9f75e47d6..784301783e1 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestAllFilesHaveCodecHeader.java @@ -21,7 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexFileNames; @@ -41,7 +41,7 @@ public class TestAllFilesHaveCodecHeader extends LuceneTestCase { public void test() throws Exception { Directory dir = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); - conf.setCodec(new Lucene42Codec()); + conf.setCodec(new Lucene45Codec()); // riw should sometimes create docvalues fields, etc RandomIndexWriter riw = new RandomIndexWriter(random(), dir, conf); Document doc = new Document(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java index 7c4f1a68a65..e222258bf0a 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java @@ -49,7 +49,7 @@ public class TestDuelingCodecs extends LuceneTestCase { public void setUp() throws Exception { super.setUp(); - // for now its SimpleText vs Lucene42(random postings format) + // for now its SimpleText vs Lucene45(random postings format) // as this gives the best overall coverage. 
when we have more // codecs we should probably pick 2 from Codec.availableCodecs() diff --git a/lucene/core/src/test/org/apache/lucene/util/TestNamedSPILoader.java b/lucene/core/src/test/org/apache/lucene/util/TestNamedSPILoader.java index 4f2c51edf00..1724cc26ec0 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestNamedSPILoader.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestNamedSPILoader.java @@ -25,8 +25,8 @@ import org.apache.lucene.codecs.Codec; // enough to test the basics via Codec public class TestNamedSPILoader extends LuceneTestCase { public void testLookup() { - Codec codec = Codec.forName("Lucene42"); - assertEquals("Lucene42", codec.getName()); + Codec codec = Codec.forName("Lucene45"); + assertEquals("Lucene45", codec.getName()); } // we want an exception if its not found. @@ -39,6 +39,6 @@ public class TestNamedSPILoader extends LuceneTestCase { public void testAvailableServices() { Set codecs = Codec.availableCodecs(); - assertTrue(codecs.contains("Lucene42")); + assertTrue(codecs.contains("Lucene45")); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42Codec.java b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42Codec.java index 4987062afc0..f8984770a41 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42Codec.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42Codec.java @@ -42,6 +42,7 @@ import org.apache.lucene.facet.params.FacetIndexingParams; * * @lucene.experimental */ +// nocommit public class Facet42Codec extends Lucene42Codec { private final Set facetFields; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java index e08899c5d01..79a78e0acfd 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42DocValuesConsumer.java @@ -68,7 +68,9 @@ public class Facet42DocValuesConsumer extends DocValuesConsumer { long totBytes = 0; for (BytesRef v : values) { - totBytes += v.length; + if (v != null) { + totBytes += v.length; + } } if (totBytes > Integer.MAX_VALUE) { @@ -78,7 +80,9 @@ public class Facet42DocValuesConsumer extends DocValuesConsumer { out.writeVInt((int) totBytes); for (BytesRef v : values) { - out.writeBytes(v.bytes, v.offset, v.length); + if (v != null) { + out.writeBytes(v.bytes, v.offset, v.length); + } } PackedInts.Writer w = PackedInts.getWriter(out, maxDoc+1, PackedInts.bitsRequired(totBytes+1), acceptableOverheadRatio); @@ -86,7 +90,9 @@ public class Facet42DocValuesConsumer extends DocValuesConsumer { int address = 0; for(BytesRef v : values) { w.add(address); - address += v.length; + if (v != null) { + address += v.length; + } } w.add(address); w.finish(); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleFieldSource.java index 8da0ad64224..73c5d497548 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/DoubleFieldSource.java @@ -68,7 +68,7 @@ public class DoubleFieldSource extends FieldCacheSource { @Override public boolean exists(int doc) { - return valid.get(doc); + return arr.get(doc) != 0 || 
valid.get(doc); } @Override @@ -142,7 +142,7 @@ public class DoubleFieldSource extends FieldCacheSource { @Override public void fillValue(int doc) { mval.value = arr.get(doc); - mval.exists = valid.get(doc); + mval.exists = mval.value != 0 || valid.get(doc); } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatFieldSource.java index da36cfc1360..c81526365c4 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/FloatFieldSource.java @@ -72,7 +72,7 @@ public class FloatFieldSource extends FieldCacheSource { @Override public boolean exists(int doc) { - return valid.get(doc); + return arr.get(doc) != 0 || valid.get(doc); } @Override @@ -88,7 +88,7 @@ public class FloatFieldSource extends FieldCacheSource { @Override public void fillValue(int doc) { mval.value = arr.get(doc); - mval.exists = valid.get(doc); + mval.exists = mval.value != 0 || valid.get(doc); } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IntFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IntFieldSource.java index 296432d6f9a..768d9d77e5a 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IntFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/IntFieldSource.java @@ -95,7 +95,7 @@ public class IntFieldSource extends FieldCacheSource { @Override public boolean exists(int doc) { - return valid.get(doc); + return arr.get(doc) != 0 || valid.get(doc); } @Override @@ -150,7 +150,7 @@ public class IntFieldSource extends FieldCacheSource { @Override public void fillValue(int doc) { mval.value = arr.get(doc); - mval.exists = valid.get(doc); + mval.exists = mval.value != 0 || valid.get(doc); } }; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/LongFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/LongFieldSource.java index 597efe89e97..d1718313d8b 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/LongFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/LongFieldSource.java @@ -81,7 +81,7 @@ public class LongFieldSource extends FieldCacheSource { @Override public boolean exists(int doc) { - return valid.get(doc); + return arr.get(doc) != 0 || valid.get(doc); } @Override @@ -141,7 +141,7 @@ public class LongFieldSource extends FieldCacheSource { @Override public void fillValue(int doc) { mval.value = arr.get(doc); - mval.exists = valid.get(doc); + mval.exists = mval.value != 0 || valid.get(doc); } }; } diff --git a/lucene/site/xsl/index.xsl b/lucene/site/xsl/index.xsl index 9a7235b395b..842f7be3d9d 100644 --- a/lucene/site/xsl/index.xsl +++ b/lucene/site/xsl/index.xsl @@ -75,7 +75,7 @@
  • System Requirements: Minimum and supported Java versions.
  • Migration Guide: What changed in Lucene 4; how to migrate code from Lucene 3.x.
  • JRE Version Migration: Information about upgrading between major JRE versions.
  • -
  • File Formats: Guide to the supported index format used by Lucene. This can be customized by using an alternate codec.
  • +
  • File Formats: Guide to the supported index format used by Lucene. This can be customized by using an alternate codec.
  • Search and Scoring in Lucene: Introduction to how Lucene scores documents.
  • Classic Scoring Formula: Formula of Lucene's classic Vector Space implementation. (look here for other models)
  • Classic QueryParser Syntax: Overview of the Classic QueryParser's syntax and features.
  • diff --git a/lucene/spatial/src/java/org/apache/lucene/spatial/bbox/BBoxSimilarityValueSource.java b/lucene/spatial/src/java/org/apache/lucene/spatial/bbox/BBoxSimilarityValueSource.java index f501f150ddb..2a3b2a7c539 100644 --- a/lucene/spatial/src/java/org/apache/lucene/spatial/bbox/BBoxSimilarityValueSource.java +++ b/lucene/spatial/src/java/org/apache/lucene/spatial/bbox/BBoxSimilarityValueSource.java @@ -78,10 +78,12 @@ public class BBoxSimilarityValueSource extends ValueSource { @Override public float floatVal(int doc) { + double minXVal = minX.get(doc); + double maxXVal = maxX.get(doc); // make sure it has minX and area - if (validMinX.get(doc) && validMaxX.get(doc)) { + if ((minXVal != 0 || validMinX.get(doc)) && (maxXVal != 0 || validMaxX.get(doc))) { rect.reset( - minX.get(doc), maxX.get(doc), + minXVal, maxXVal, minY.get(doc), maxY.get(doc)); return (float) similarity.score(rect, null); } else { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java index 8c8ad170c2c..60f68ead5ec 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java @@ -34,7 +34,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -161,7 +161,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { * codec to use. */ protected IndexWriterConfig getIndexWriterConfig(Version matchVersion, Analyzer indexAnalyzer) { IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, indexAnalyzer); - iwc.setCodec(new Lucene42Codec()); + iwc.setCodec(new Lucene45Codec()); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); return iwc; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java index 19cad74151f..9591997229d 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingCodec.java @@ -23,10 +23,10 @@ import org.apache.lucene.codecs.NormsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; /** - * Acts like {@link Lucene42Codec} but with additional asserts. + * Acts like {@link Lucene45Codec} but with additional asserts. 
*/ public final class AssertingCodec extends FilterCodec { @@ -37,7 +37,7 @@ public final class AssertingCodec extends FilterCodec { private final NormsFormat norms = new AssertingNormsFormat(); public AssertingCodec() { - super("Asserting", new Lucene42Codec()); + super("Asserting", new Lucene45Codec()); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java index a59e4c58c06..298d7aaf011 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java @@ -24,7 +24,7 @@ import java.util.NoSuchElementException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat; import org.apache.lucene.index.AssertingAtomicReader; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.FieldInfo; @@ -39,10 +39,10 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.OpenBitSet; /** - * Just like {@link Lucene42DocValuesFormat} but with additional asserts. + * Just like {@link Lucene45DocValuesFormat} but with additional asserts. */ public class AssertingDocValuesFormat extends DocValuesFormat { - private final DocValuesFormat in = new Lucene42DocValuesFormat(); + private final DocValuesFormat in = new Lucene45DocValuesFormat(); public AssertingDocValuesFormat() { super("Asserting"); diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java index 8b64401b452..5579af6245f 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java @@ -28,6 +28,7 @@ import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +// nocommit /** * Just like {@link Lucene42NormsFormat} but with additional asserts. 
*/ diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardCodec.java index 644f04becea..15310e09bc4 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardCodec.java @@ -23,10 +23,12 @@ import org.apache.lucene.codecs.NormsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat; +import org.apache.lucene.codecs.diskdv.DiskNormsFormat; import org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsFormat; import org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat; import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; /** Codec that tries to use as little ram as possible because he spent all his money on beer */ // TODO: better name :) @@ -39,11 +41,11 @@ public class CheapBastardCodec extends FilterCodec { private final StoredFieldsFormat storedFields = new Lucene40StoredFieldsFormat(); private final TermVectorsFormat termVectors = new Lucene40TermVectorsFormat(); // these go to disk for all docvalues/norms datastructures - private final DocValuesFormat docValues = new CheapBastardDocValuesFormat(); - private final NormsFormat norms = new CheapBastardNormsFormat(); + private final DocValuesFormat docValues = new DiskDocValuesFormat(); + private final NormsFormat norms = new DiskNormsFormat(); public CheapBastardCodec() { - super("CheapBastard", new Lucene42Codec()); + super("CheapBastard", new Lucene45Codec()); } public PostingsFormat postingsFormat() { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java deleted file mode 100644 index 07f152cd6ee..00000000000 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesFormat.java +++ /dev/null @@ -1,74 +0,0 @@ -package org.apache.lucene.codecs.cheapbastard; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -import org.apache.lucene.codecs.DocValuesConsumer; -import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.DocValuesFormat; -import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer; -import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.util.BytesRef; - -/** - * DocValues format that keeps everything on disk. - *

- * <p> - * Internally there are only 2 field types: - * <ul> - *    <li>BINARY: a big byte[]. - *    <li>NUMERIC: packed ints - * </ul> - * SORTED is encoded as BINARY + NUMERIC - * <p> - * NOTE: Don't use this format in production (its not very efficient). - * Most likely you would want some parts in RAM, other parts on disk. - * <p>
    - * @lucene.experimental - */ -public final class CheapBastardDocValuesFormat extends DocValuesFormat { - - public CheapBastardDocValuesFormat() { - super("CheapBastard"); - } - - @Override - public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - return new DiskDocValuesConsumer(state, DiskDocValuesFormat.DATA_CODEC, - DiskDocValuesFormat.DATA_EXTENSION, - DiskDocValuesFormat.META_CODEC, - DiskDocValuesFormat.META_EXTENSION) { - // don't ever write an index, we dont want to use RAM :) - @Override - protected void addTermsDict(FieldInfo field, Iterable values) throws IOException { - addBinaryField(field, values); - } - }; - } - - @Override - public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { - return new CheapBastardDocValuesProducer(state, DiskDocValuesFormat.DATA_CODEC, - DiskDocValuesFormat.DATA_EXTENSION, - DiskDocValuesFormat.META_CODEC, - DiskDocValuesFormat.META_EXTENSION); - } -} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java deleted file mode 100644 index f6098dc1f21..00000000000 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/cheapbastard/CheapBastardDocValuesProducer.java +++ /dev/null @@ -1,444 +0,0 @@ -package org.apache.lucene.codecs.cheapbastard; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
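// (Editorial sketch, not part of the patch.) As the readFields comments below spell out,
// this producer composes the complex doc values types from two primitives: SORTED is a
// BINARY (the sorted term bytes) plus a NUMERIC (per-document ords), and SORTED_SET adds
// a monotonic "ordIndex" NUMERIC recording where each document's run of ords ends.
// Worked ordIndex example (values made up for illustration):
//   doc0 has ords {4,7}, doc1 has none, doc2 has {1,3,9}  ->  ordIndex = [2, 2, 5]
//   offset    = docID == 0 ? 0 : ordIndex.get(docID - 1);   // doc2 -> 2
//   endOffset = ordIndex.get(docID);                        // doc2 -> 5
// nextOrd() then returns ordinals.get(offset) .. ordinals.get(endOffset - 1), i.e. {1, 3, 9}.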
- */ - -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.DELTA_COMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.GCD_COMPRESSED; -import static org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer.TABLE_COMPRESSED; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.DocValuesProducer.SortedSetDocsWithField; -import org.apache.lucene.codecs.diskdv.DiskDocValuesConsumer; -import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat; -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SortedDocValues; -import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.packed.BlockPackedReader; -import org.apache.lucene.util.packed.MonotonicBlockPackedReader; -import org.apache.lucene.util.packed.PackedInts; - -class CheapBastardDocValuesProducer extends DocValuesProducer { - private final Map numerics; - private final Map ords; - private final Map ordIndexes; - private final Map binaries; - private final IndexInput data; - private final int maxDoc; - - CheapBastardDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { - String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); - this.maxDoc = state.segmentInfo.getDocCount(); - // read in the entries from the metadata file. 
- IndexInput in = state.directory.openInput(metaName, state.context); - boolean success = false; - final int version; - try { - version = CodecUtil.checkHeader(in, metaCodec, - DiskDocValuesFormat.VERSION_CURRENT, - DiskDocValuesFormat.VERSION_CURRENT); - numerics = new HashMap(); - ords = new HashMap(); - ordIndexes = new HashMap(); - binaries = new HashMap(); - readFields(in); - - success = true; - } finally { - if (success) { - IOUtils.close(in); - } else { - IOUtils.closeWhileHandlingException(in); - } - } - - success = false; - try { - String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); - data = state.directory.openInput(dataName, state.context); - final int version2 = CodecUtil.checkHeader(data, dataCodec, - DiskDocValuesFormat.VERSION_CURRENT, - DiskDocValuesFormat.VERSION_CURRENT); - if (version != version2) { - throw new CorruptIndexException("Versions mismatch"); - } - - success = true; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(this.data); - } - } - - } - - private void readFields(IndexInput meta) throws IOException { - int fieldNumber = meta.readVInt(); - while (fieldNumber != -1) { - byte type = meta.readByte(); - if (type == DiskDocValuesFormat.NUMERIC) { - numerics.put(fieldNumber, readNumericEntry(meta)); - } else if (type == DiskDocValuesFormat.BINARY) { - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - } else if (type == DiskDocValuesFormat.SORTED) { - // sorted = binary + numeric - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != DiskDocValuesFormat.BINARY) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - NumericEntry n = readNumericEntry(meta); - ords.put(fieldNumber, n); - } else if (type == DiskDocValuesFormat.SORTED_SET) { - // sortedset = binary + numeric + ordIndex - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != DiskDocValuesFormat.BINARY) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - NumericEntry n1 = readNumericEntry(meta); - ords.put(fieldNumber, n1); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != DiskDocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - NumericEntry n2 = readNumericEntry(meta); - ordIndexes.put(fieldNumber, n2); - } else { - throw new CorruptIndexException("invalid type: " + type + ", 
resource=" + meta); - } - fieldNumber = meta.readVInt(); - } - } - - static NumericEntry readNumericEntry(IndexInput meta) throws IOException { - NumericEntry entry = new NumericEntry(); - entry.format = meta.readVInt(); - entry.packedIntsVersion = meta.readVInt(); - entry.offset = meta.readLong(); - entry.count = meta.readVLong(); - entry.blockSize = meta.readVInt(); - switch(entry.format) { - case GCD_COMPRESSED: - entry.minValue = meta.readLong(); - entry.gcd = meta.readLong(); - break; - case TABLE_COMPRESSED: - if (entry.count > Integer.MAX_VALUE) { - throw new CorruptIndexException("Cannot use TABLE_COMPRESSED with more than MAX_VALUE values, input=" + meta); - } - final int uniqueValues = meta.readVInt(); - if (uniqueValues > 256) { - throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + meta); - } - entry.table = new long[uniqueValues]; - for (int i = 0; i < uniqueValues; ++i) { - entry.table[i] = meta.readLong(); - } - break; - case DELTA_COMPRESSED: - break; - default: - throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); - } - return entry; - } - - static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException { - BinaryEntry entry = new BinaryEntry(); - int format = meta.readVInt(); - if (format != DiskDocValuesConsumer.BINARY_FIXED_UNCOMPRESSED && format != DiskDocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED) { - throw new CorruptIndexException("Unexpected format for binary entry: " + format + ", input=" + meta); - } - entry.minLength = meta.readVInt(); - entry.maxLength = meta.readVInt(); - entry.count = meta.readVLong(); - entry.offset = meta.readLong(); - if (entry.minLength != entry.maxLength) { - entry.addressesOffset = meta.readLong(); - entry.packedIntsVersion = meta.readVInt(); - entry.blockSize = meta.readVInt(); - } - return entry; - } - - @Override - public NumericDocValues getNumeric(FieldInfo field) throws IOException { - NumericEntry entry = numerics.get(field.number); - return getNumeric(field, entry); - } - - private LongNumericDocValues getNumeric(FieldInfo field, final NumericEntry entry) throws IOException { - final IndexInput data = this.data.clone(); - data.seek(entry.offset); - - switch (entry.format) { - case DELTA_COMPRESSED: - final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - return new LongNumericDocValues() { - @Override - public long get(long id) { - return reader.get(id); - } - }; - case GCD_COMPRESSED: - final long min = entry.minValue; - final long mult = entry.gcd; - final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - return new LongNumericDocValues() { - @Override - public long get(long id) { - return min + mult * quotientReader.get(id); - } - }; - case TABLE_COMPRESSED: - final long[] table = entry.table; - final int bitsRequired = PackedInts.bitsRequired(table.length - 1); - final PackedInts.Reader ords = PackedInts.getDirectReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, (int) entry.count, bitsRequired); - return new LongNumericDocValues() { - @Override - long get(long id) { - return table[(int) ords.get((int) id)]; - } - }; - default: - throw new AssertionError(); - } - } - - @Override - public BinaryDocValues getBinary(FieldInfo field) throws IOException { - BinaryEntry bytes = binaries.get(field.number); - if (bytes.minLength == bytes.maxLength) { - return 
getFixedBinary(field, bytes); - } else { - return getVariableBinary(field, bytes); - } - } - - private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) { - final IndexInput data = this.data.clone(); - - return new LongBinaryDocValues() { - @Override - public void get(long id, BytesRef result) { - long address = bytes.offset + id * bytes.maxLength; - try { - data.seek(address); - // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) - // assume "they" own the bytes after calling this! - final byte[] buffer = new byte[bytes.maxLength]; - data.readBytes(buffer, 0, buffer.length); - result.bytes = buffer; - result.offset = 0; - result.length = buffer.length; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; - } - - private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final IndexInput data = this.data.clone(); - data.seek(bytes.addressesOffset); - - final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, true); - return new LongBinaryDocValues() { - @Override - public void get(long id, BytesRef result) { - long startAddress = bytes.offset + (id == 0 ? 0 : + addresses.get(id-1)); - long endAddress = bytes.offset + addresses.get(id); - int length = (int) (endAddress - startAddress); - try { - data.seek(startAddress); - // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) - // assume "they" own the bytes after calling this! - final byte[] buffer = new byte[length]; - data.readBytes(buffer, 0, buffer.length); - result.bytes = buffer; - result.offset = 0; - result.length = length; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; - } - - @Override - public SortedDocValues getSorted(FieldInfo field) throws IOException { - final int valueCount = (int) binaries.get(field.number).count; - final BinaryDocValues binary = getBinary(field); - final NumericDocValues ordinals = getNumeric(field, ords.get(field.number)); - return new SortedDocValues() { - - @Override - public int getOrd(int docID) { - return (int) ordinals.get(docID); - } - - @Override - public void lookupOrd(int ord, BytesRef result) { - binary.get(ord, result); - } - - @Override - public int getValueCount() { - return valueCount; - } - }; - } - - @Override - public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - final long valueCount = binaries.get(field.number).count; - final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); - final LongNumericDocValues ordinals = getNumeric(field, ords.get(field.number)); - NumericEntry entry = ordIndexes.get(field.number); - IndexInput data = this.data.clone(); - data.seek(entry.offset); - final MonotonicBlockPackedReader ordIndex = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - - return new SortedSetDocValues() { - long offset; - long endOffset; - - @Override - public long nextOrd() { - if (offset == endOffset) { - return NO_MORE_ORDS; - } else { - long ord = ordinals.get(offset); - offset++; - return ord; - } - } - - @Override - public void setDocument(int docID) { - offset = (docID == 0 ? 
0 : ordIndex.get(docID-1)); - endOffset = ordIndex.get(docID); - } - - @Override - public void lookupOrd(long ord, BytesRef result) { - binary.get(ord, result); - } - - @Override - public long getValueCount() { - return valueCount; - } - }; - } - - @Override - public Bits getDocsWithField(FieldInfo field) throws IOException { - if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) { - return new SortedSetDocsWithField(getSortedSet(field), maxDoc); - } else { - return new Bits.MatchAllBits(maxDoc); - } - } - - @Override - public void close() throws IOException { - data.close(); - } - - static class NumericEntry { - long offset; - - int format; - int packedIntsVersion; - long count; - int blockSize; - - long minValue; - long gcd; - long table[]; - } - - static class BinaryEntry { - long offset; - - long count; - int minLength; - int maxLength; - long addressesOffset; - int packedIntsVersion; - int blockSize; - } - - // internally we compose complex dv (sorted/sortedset) from other ones - static abstract class LongNumericDocValues extends NumericDocValues { - @Override - public final long get(int docID) { - return get((long) docID); - } - - abstract long get(long id); - } - - static abstract class LongBinaryDocValues extends BinaryDocValues { - @Override - public final void get(int docID, BytesRef result) { - get((long)docID, result); - } - - abstract void get(long id, BytesRef Result); - } -} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java index 5fa95c4c9e0..bcfd9361e37 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/CompressingCodec.java @@ -23,13 +23,13 @@ import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.compressing.dummy.DummyCompressingCodec; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import com.carrotsearch.randomizedtesting.generators.RandomInts; /** * A codec that uses {@link CompressingStoredFieldsFormat} for its stored - * fields and delegates to {@link Lucene42Codec} for everything else. + * fields and delegates to {@link Lucene45Codec} for everything else. 
*/ public abstract class CompressingCodec extends FilterCodec { @@ -73,7 +73,7 @@ public abstract class CompressingCodec extends FilterCodec { * Creates a compressing codec with a given segment suffix */ public CompressingCodec(String name, String segmentSuffix, CompressionMode compressionMode, int chunkSize) { - super(name, new Lucene42Codec()); + super(name, new Lucene45Codec()); this.storedFieldsFormat = new CompressingStoredFieldsFormat(name, segmentSuffix, compressionMode, chunkSize); this.termVectorsFormat = new CompressingTermVectorsFormat(name, segmentSuffix, compressionMode, chunkSize); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java index 24f41ab2995..8b2ed06e9e3 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java @@ -2,8 +2,8 @@ package org.apache.lucene.codecs.compressing; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.NormsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; -import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; +import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; // nocommit +import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; // nocommit import org.apache.lucene.util.packed.PackedInts; /* diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java index 7c6ba48f91d..5b6385e0d36 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java @@ -6,6 +6,7 @@ import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; import org.apache.lucene.util.packed.PackedInts; +// nocommit /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java index 2f1fc293592..6b4b1091b97 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java @@ -21,6 +21,7 @@ import org.apache.lucene.util.packed.PackedInts; * limitations under the License. 
*/ +// nocommit /** CompressionCodec that uses {@link CompressionMode#HIGH_COMPRESSION} */ public class HighCompressionCompressingCodec extends CompressingCodec { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java index 430948704a2..c25a0f3c764 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseStoredFieldsFormatTestCase.java @@ -32,7 +32,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.compressing.CompressingCodec; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.document.Document; import org.apache.lucene.document.DoubleField; @@ -502,7 +502,7 @@ public abstract class BaseStoredFieldsFormatTestCase extends LuceneTestCase { // get another codec, other than the default: so we are merging segments across different codecs final Codec otherCodec; if ("SimpleText".equals(Codec.getDefault().getName())) { - otherCodec = new Lucene42Codec(); + otherCodec = new Lucene45Codec(); } else { otherCodec = new SimpleTextCodec(); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index 9ecc5122f33..239be0f34c7 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -35,10 +35,9 @@ import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; import org.apache.lucene.codecs.lucene41ords.Lucene41WithOrds; import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapDocFreqInterval; import org.apache.lucene.codecs.lucene41vargap.Lucene41VarGapFixedInterval; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; -import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat; import org.apache.lucene.codecs.bloom.TestBloomFilteredLucene41Postings; -import org.apache.lucene.codecs.cheapbastard.CheapBastardDocValuesFormat; import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat; import org.apache.lucene.codecs.memory.DirectPostingsFormat; import org.apache.lucene.codecs.memory.MemoryPostingsFormat; @@ -62,7 +61,7 @@ import org.apache.lucene.util._TestUtil; * documents in different orders and the test will still be deterministic * and reproducable. 
*/ -public class RandomCodec extends Lucene42Codec { +public class RandomCodec extends Lucene45Codec { /** Shuffled list of postings formats to use for new mappings */ private List formats = new ArrayList(); @@ -148,11 +147,10 @@ public class RandomCodec extends Lucene42Codec { new MemoryPostingsFormat(false, random.nextFloat())); addDocValues(avoidCodecs, - new Lucene42DocValuesFormat(), + new Lucene45DocValuesFormat(), new DiskDocValuesFormat(), new SimpleTextDocValuesFormat(), - new AssertingDocValuesFormat(), - new CheapBastardDocValuesFormat()); + new AssertingDocValuesFormat()); Collections.shuffle(formats, random); Collections.shuffle(dvFormats, random); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java index ebe0426a18c..1ab8c818b7f 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java @@ -40,6 +40,7 @@ import org.apache.lucene.codecs.lucene40.Lucene40RWCodec; import org.apache.lucene.codecs.lucene40.Lucene40RWPostingsFormat; import org.apache.lucene.codecs.lucene41.Lucene41RWCodec; import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.index.RandomCodec; import org.apache.lucene.search.RandomSimilarityProvider; @@ -146,6 +147,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { savedCodec = Codec.getDefault(); int randomVal = random.nextInt(10); + // nocommit: 4.2 impersonator if ("Lucene40".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && "random".equals(TEST_POSTINGSFORMAT) && "random".equals(TEST_DOCVALUESFORMAT) && @@ -182,7 +184,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { dvFormat = DocValuesFormat.forName(TEST_DOCVALUESFORMAT); } - codec = new Lucene42Codec() { + codec = new Lucene45Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { return format; diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java index baf09e121a3..d79c948ed7e 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java @@ -44,7 +44,7 @@ import java.util.zip.ZipFile; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.document.BinaryDocValuesField; @@ -703,7 +703,7 @@ public class _TestUtil { if (LuceneTestCase.VERBOSE) { System.out.println("forcing postings format to:" + format); } - return new Lucene42Codec() { + return new Lucene45Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { return format; @@ -721,7 +721,7 @@ public class _TestUtil { if (LuceneTestCase.VERBOSE) { System.out.println("forcing docvalues format to:" + format); } - return new Lucene42Codec() { + return new Lucene45Codec() { 
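// (Editorial note, not part of the patch.) The anonymous subclass opened above pins one
// format for every field, regardless of name. Hypothetical test usage, assuming the
// enclosing helpers are _TestUtil.alwaysPostingsFormat(...) and alwaysDocValuesFormat(...)
// (the method names are not visible in these hunks):
//   IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
//   iwc.setCodec(_TestUtil.alwaysDocValuesFormat(new Lucene45DocValuesFormat()));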
@Override public DocValuesFormat getDocValuesFormatForField(String field) { return format; @@ -757,9 +757,12 @@ public class _TestUtil { } } + // nocommit: remove this, push this test to Lucene40/Lucene42 codec tests public static boolean fieldSupportsHugeBinaryDocValues(String field) { String dvFormat = getDocValuesFormat(field); - return dvFormat.equals("CheapBastard") || + System.out.println(dvFormat); + return dvFormat.equals("Lucene45") || + dvFormat.equals("Asserting") || dvFormat.equals("Disk") || dvFormat.equals("SimpleText"); } diff --git a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat index 2eeda2459c7..d1798334486 100644 --- a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat +++ b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat @@ -14,4 +14,3 @@ # limitations under the License. org.apache.lucene.codecs.asserting.AssertingDocValuesFormat -org.apache.lucene.codecs.cheapbastard.CheapBastardDocValuesFormat \ No newline at end of file diff --git a/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java b/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java index 52d48280945..49bca112fc2 100644 --- a/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java +++ b/solr/core/src/java/org/apache/solr/core/SchemaCodecFactory.java @@ -3,7 +3,7 @@ package org.apache.solr.core; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.solr.common.util.NamedList; import org.apache.solr.schema.SchemaField; import org.apache.solr.util.plugin.SolrCoreAware; @@ -51,7 +51,7 @@ public class SchemaCodecFactory extends CodecFactory implements SolrCoreAware { @Override public void init(NamedList args) { super.init(args); - codec = new Lucene42Codec() { + codec = new Lucene45Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { final SchemaField fieldOrNull = core.getLatestSchema().getFieldOrNull(field); diff --git a/solr/core/src/java/org/apache/solr/request/NumericFacets.java b/solr/core/src/java/org/apache/solr/request/NumericFacets.java index 96796c996be..e16e6358ca4 100644 --- a/solr/core/src/java/org/apache/solr/request/NumericFacets.java +++ b/solr/core/src/java/org/apache/solr/request/NumericFacets.java @@ -190,8 +190,9 @@ final class NumericFacets { } docsWithField = FieldCache.DEFAULT.getDocsWithField(ctx.reader(), fieldName); } - if (docsWithField.get(doc - ctx.docBase)) { - hashTable.add(doc, longs.get(doc - ctx.docBase), 1); + long v = longs.get(doc - ctx.docBase); + if (v != 0 || docsWithField.get(doc - ctx.docBase)) { + hashTable.add(doc, v, 1); } else { ++missingCount; } diff --git a/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml b/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml index 15074809892..73593829cf9 100644 --- a/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema_codec.xml @@ -22,7 +22,7 @@ - + diff --git a/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java b/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java index 0f06d44e2fa..c970652921b 100644 --- 
a/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java +++ b/solr/core/src/test/org/apache/solr/core/TestCodecSupport.java @@ -55,10 +55,10 @@ public class TestCodecSupport extends SolrTestCaseJ4 { PerFieldDocValuesFormat format = (PerFieldDocValuesFormat) codec.docValuesFormat(); assertEquals("Disk", format.getDocValuesFormatForField(schemaField.getName()).getName()); schemaField = fields.get("string_memory_f"); - assertEquals("Lucene42", + assertEquals("Lucene45", format.getDocValuesFormatForField(schemaField.getName()).getName()); schemaField = fields.get("string_f"); - assertEquals("Lucene42", + assertEquals("Lucene45", format.getDocValuesFormatForField(schemaField.getName()).getName()); } @@ -80,8 +80,8 @@ public class TestCodecSupport extends SolrTestCaseJ4 { assertEquals("Disk", format.getDocValuesFormatForField("foo_disk").getName()); assertEquals("Disk", format.getDocValuesFormatForField("bar_disk").getName()); - assertEquals("Lucene42", format.getDocValuesFormatForField("foo_memory").getName()); - assertEquals("Lucene42", format.getDocValuesFormatForField("bar_memory").getName()); + assertEquals("Lucene45", format.getDocValuesFormatForField("foo_memory").getName()); + assertEquals("Lucene45", format.getDocValuesFormatForField("bar_memory").getName()); } public void testUnknownField() { From 642ac287a24236efc04f109b305df569ebc9cdfc Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 19 Aug 2013 13:49:03 +0000 Subject: [PATCH 04/16] make 4.2 read only / setup impersonator git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515416 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/codecs/lucene42/Lucene42Codec.java | 15 +- .../lucene42/Lucene42DocValuesFormat.java | 15 +- .../lucene42/Lucene42DocValuesProducer.java | 37 ++-- .../lucene42/Lucene42NormsConsumer.java | 209 ++++++++++++++++++ .../codecs/lucene42/Lucene42NormsFormat.java | 4 +- .../lucene42/TestLucene42DocValuesFormat.java | 2 +- .../index/TestBackwardsCompatibility.java | 2 +- .../Facet45Codec.java} | 18 +- .../lucene/facet/codecs/facet45/package.html | 22 ++ .../services/org.apache.lucene.codecs.Codec | 2 +- .../apache/lucene/facet/FacetTestCase.java | 4 +- .../lucene/facet/search/TestDemoFacets.java | 4 +- .../compressing/FastCompressingCodec.java | 9 +- .../FastDecompressionCompressingCodec.java | 8 - .../HighCompressionCompressingCodec.java | 1 - .../lucene42/Lucene42DocValuesConsumer.java | 25 +-- .../codecs/lucene42/Lucene42RWCodec.java | 39 ++++ .../lucene42/Lucene42RWDocValuesFormat.java | 35 +++ .../lucene/codecs/lucene42/package.html | 25 +++ .../index/BaseDocValuesFormatTestCase.java | 1 + .../util/TestRuleSetupAndRestoreClassEnv.java | 9 +- .../services/org.apache.lucene.codecs.Codec | 1 + .../org.apache.lucene.codecs.DocValuesFormat | 1 + 23 files changed, 414 insertions(+), 74 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsConsumer.java rename lucene/facet/src/java/org/apache/lucene/facet/codecs/{facet42/Facet42Codec.java => facet45/Facet45Codec.java} (85%) create mode 100644 lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/package.html rename lucene/{core => test-framework}/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java (94%) create mode 100644 lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java create mode 100644 lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWDocValuesFormat.java create mode 100644 
lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/package.html diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java index 4ec3bd833a4..0ce97ba76e6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42Codec.java @@ -17,7 +17,10 @@ package org.apache.lucene.codecs.lucene42; * limitations under the License. */ +import java.io.IOException; + import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.FilterCodec; @@ -32,6 +35,7 @@ import org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat; import org.apache.lucene.codecs.lucene41.Lucene41StoredFieldsFormat; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; +import org.apache.lucene.index.SegmentWriteState; /** * Implements the Lucene 4.2 index format, with configurable per-field postings @@ -42,10 +46,12 @@ import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; * * @see org.apache.lucene.codecs.lucene42 package documentation for file format details. * @lucene.experimental + * @deprecated Only for reading old 4.2 segments */ // NOTE: if we make largish changes in a minor release, easier to just make Lucene43Codec or whatever // if they are backwards compatible or smallish we can probably do the backwards in the postingsreader // (it writes a minor version, etc). +@Deprecated public class Lucene42Codec extends Codec { private final StoredFieldsFormat fieldsFormat = new Lucene41StoredFieldsFormat(); private final TermVectorsFormat vectorsFormat = new Lucene42TermVectorsFormat(); @@ -129,10 +135,15 @@ public class Lucene42Codec extends Codec { private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41"); private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene42"); - private final NormsFormat normsFormat = new Lucene42NormsFormat(); + private final NormsFormat normsFormat = new Lucene42NormsFormat() { + @Override + public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException { + throw new UnsupportedOperationException("this codec can only be used for reading"); + } + }; @Override - public final NormsFormat normsFormat() { + public NormsFormat normsFormat() { return normsFormat; } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java index 55bf8097561..00f18606a49 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java @@ -123,8 +123,10 @@ import org.apache.lucene.util.packed.BlockPackedWriter; *

* <ul> *   <li>Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length. * </ul>
    + * @deprecated Only for reading old 4.2 segments */ -public final class Lucene42DocValuesFormat extends DocValuesFormat { +@Deprecated +public class Lucene42DocValuesFormat extends DocValuesFormat { /** Maximum length for each binary doc values field. */ public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2; @@ -154,8 +156,7 @@ public final class Lucene42DocValuesFormat extends DocValuesFormat { @Override public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - // note: we choose DEFAULT here (its reasonably fast, and for small bpv has tiny waste) - return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, acceptableOverheadRatio); + throw new UnsupportedOperationException("this codec can only be used for reading"); } @Override @@ -163,8 +164,8 @@ public final class Lucene42DocValuesFormat extends DocValuesFormat { return new Lucene42DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION); } - private static final String DATA_CODEC = "Lucene42DocValuesData"; - private static final String DATA_EXTENSION = "dvd"; - private static final String METADATA_CODEC = "Lucene42DocValuesMetadata"; - private static final String METADATA_EXTENSION = "dvm"; + static final String DATA_CODEC = "Lucene42DocValuesData"; + static final String DATA_EXTENSION = "dvd"; + static final String METADATA_CODEC = "Lucene42DocValuesMetadata"; + static final String METADATA_EXTENSION = "dvm"; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java index c2a95fb8195..7b111c53e5f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java @@ -17,11 +17,6 @@ package org.apache.lucene.codecs.lucene42; * limitations under the License. 
*/ -import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.DELTA_COMPRESSED; -import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.GCD_COMPRESSED; -import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.TABLE_COMPRESSED; -import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesConsumer.UNCOMPRESSED; - import java.io.IOException; import java.util.Comparator; import java.util.HashMap; @@ -78,6 +73,22 @@ class Lucene42DocValuesProducer extends DocValuesProducer { new HashMap>(); private final int maxDoc; + + + static final byte NUMBER = 0; + static final byte BYTES = 1; + static final byte FST = 2; + + static final int BLOCK_SIZE = 4096; + + static final byte DELTA_COMPRESSED = 0; + static final byte TABLE_COMPRESSED = 1; + static final byte UNCOMPRESSED = 2; + static final byte GCD_COMPRESSED = 3; + + static final int VERSION_START = 0; + static final int VERSION_GCD_COMPRESSION = 1; + static final int VERSION_CURRENT = VERSION_GCD_COMPRESSION; Lucene42DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { maxDoc = state.segmentInfo.getDocCount(); @@ -88,8 +99,8 @@ class Lucene42DocValuesProducer extends DocValuesProducer { final int version; try { version = CodecUtil.checkHeader(in, metaCodec, - Lucene42DocValuesConsumer.VERSION_START, - Lucene42DocValuesConsumer.VERSION_CURRENT); + VERSION_START, + VERSION_CURRENT); numerics = new HashMap(); binaries = new HashMap(); fsts = new HashMap(); @@ -109,8 +120,8 @@ class Lucene42DocValuesProducer extends DocValuesProducer { String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); data = state.directory.openInput(dataName, state.context); final int version2 = CodecUtil.checkHeader(data, dataCodec, - Lucene42DocValuesConsumer.VERSION_START, - Lucene42DocValuesConsumer.VERSION_CURRENT); + VERSION_START, + VERSION_CURRENT); if (version != version2) { throw new CorruptIndexException("Format versions mismatch"); } @@ -127,7 +138,7 @@ class Lucene42DocValuesProducer extends DocValuesProducer { int fieldNumber = meta.readVInt(); while (fieldNumber != -1) { int fieldType = meta.readByte(); - if (fieldType == Lucene42DocValuesConsumer.NUMBER) { + if (fieldType == NUMBER) { NumericEntry entry = new NumericEntry(); entry.offset = meta.readLong(); entry.format = meta.readByte(); @@ -140,11 +151,11 @@ class Lucene42DocValuesProducer extends DocValuesProducer { default: throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); } - if (entry.format != Lucene42DocValuesConsumer.UNCOMPRESSED) { + if (entry.format != UNCOMPRESSED) { entry.packedIntsVersion = meta.readVInt(); } numerics.put(fieldNumber, entry); - } else if (fieldType == Lucene42DocValuesConsumer.BYTES) { + } else if (fieldType == BYTES) { BinaryEntry entry = new BinaryEntry(); entry.offset = meta.readLong(); entry.numBytes = meta.readLong(); @@ -155,7 +166,7 @@ class Lucene42DocValuesProducer extends DocValuesProducer { entry.blockSize = meta.readVInt(); } binaries.put(fieldNumber, entry); - } else if (fieldType == Lucene42DocValuesConsumer.FST) { + } else if (fieldType == FST) { FSTEntry entry = new FSTEntry(); entry.offset = meta.readLong(); entry.numOrds = meta.readVLong(); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsConsumer.java new file 
mode 100644 index 00000000000..797dd807992 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsConsumer.java @@ -0,0 +1,209 @@ +package org.apache.lucene.codecs.lucene42; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.MathUtil; +import org.apache.lucene.util.packed.BlockPackedWriter; +import org.apache.lucene.util.packed.PackedInts.FormatAndBits; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Writer for {@link Lucene42NormsFormat} + */ +class Lucene42NormsConsumer extends DocValuesConsumer { + static final int VERSION_START = 0; + static final int VERSION_GCD_COMPRESSION = 1; + static final int VERSION_CURRENT = VERSION_GCD_COMPRESSION; + + static final byte NUMBER = 0; + + static final int BLOCK_SIZE = 4096; + + static final byte DELTA_COMPRESSED = 0; + static final byte TABLE_COMPRESSED = 1; + static final byte UNCOMPRESSED = 2; + static final byte GCD_COMPRESSED = 3; + + final IndexOutput data, meta; + final int maxDoc; + final float acceptableOverheadRatio; + + Lucene42NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension, float acceptableOverheadRatio) throws IOException { + this.acceptableOverheadRatio = acceptableOverheadRatio; + maxDoc = state.segmentInfo.getDocCount(); + boolean success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.createOutput(dataName, state.context); + CodecUtil.writeHeader(data, dataCodec, VERSION_CURRENT); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + meta = state.directory.createOutput(metaName, state.context); + CodecUtil.writeHeader(meta, metaCodec, VERSION_CURRENT); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void addNumericField(FieldInfo field, Iterable values) throws IOException { + meta.writeVInt(field.number); + meta.writeByte(NUMBER); + meta.writeLong(data.getFilePointer()); + long minValue = Long.MAX_VALUE; + long maxValue = Long.MIN_VALUE; + long gcd = 0; + // TODO: more efficient? 
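// (Editorial sketch, not part of the patch.) The single pass below gathers min/max, the
// running gcd of deltas, and up to 256 unique values, then picks UNCOMPRESSED,
// TABLE_COMPRESSED, GCD_COMPRESSED or DELTA_COMPRESSED. Worked GCD arithmetic with
// made-up norms {100, 130, 160} (an input this small would really take the table path;
// only the arithmetic is illustrated):
//   minValue = 100, gcd = gcd(130-100, 160-100) = 30  ->  block-packed quotients {0, 1, 2}
// and the read side (see the disk producer earlier in this patch) reconstructs:
//   long value = minValue + gcd * quotientReader.get(id);   // 100 + 30*2 == 160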
+ HashSet uniqueValues = null; + if (true) { + uniqueValues = new HashSet<>(); + + long count = 0; + for (Number nv : values) { + assert nv != null; + final long v = nv.longValue(); + + if (gcd != 1) { + if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { + // in that case v - minValue might overflow and make the GCD computation return + // wrong results. Since these extreme values are unlikely, we just discard + // GCD computation for them + gcd = 1; + } else if (count != 0) { // minValue needs to be set first + gcd = MathUtil.gcd(gcd, v - minValue); + } + } + + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + + if (uniqueValues != null) { + if (uniqueValues.add(v)) { + if (uniqueValues.size() > 256) { + uniqueValues = null; + } + } + } + + ++count; + } + assert count == maxDoc; + } + + if (uniqueValues != null) { + // small number of unique values + final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1); + FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio); + if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) { + meta.writeByte(UNCOMPRESSED); // uncompressed + for (Number nv : values) { + data.writeByte(nv == null ? 0 : (byte) nv.longValue()); + } + } else { + meta.writeByte(TABLE_COMPRESSED); // table-compressed + Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]); + final HashMap encode = new HashMap(); + data.writeVInt(decode.length); + for (int i = 0; i < decode.length; i++) { + data.writeLong(decode[i]); + encode.put(decode[i], i); + } + + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeVInt(formatAndBits.format.getId()); + data.writeVInt(formatAndBits.bitsPerValue); + + final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE); + for(Number nv : values) { + writer.add(encode.get(nv == null ? 0 : nv.longValue())); + } + writer.finish(); + } + } else if (gcd != 0 && gcd != 1) { + meta.writeByte(GCD_COMPRESSED); + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeLong(minValue); + data.writeLong(gcd); + data.writeVInt(BLOCK_SIZE); + + final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); + for (Number nv : values) { + long value = nv == null ? 0 : nv.longValue(); + writer.add((value - minValue) / gcd); + } + writer.finish(); + } else { + meta.writeByte(DELTA_COMPRESSED); // delta-compressed + + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeVInt(BLOCK_SIZE); + + final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); + for (Number nv : values) { + writer.add(nv == null ? 
0 : nv.longValue()); + } + writer.finish(); + } + } + + @Override + public void close() throws IOException { + boolean success = false; + try { + if (meta != null) { + meta.writeVInt(-1); // write EOF marker + } + success = true; + } finally { + if (success) { + IOUtils.close(data, meta); + } else { + IOUtils.closeWhileHandlingException(data, meta); + } + } + } + + @Override + public void addBinaryField(FieldInfo field, final Iterable values) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public void addSortedSetField(FieldInfo field, Iterable values, final Iterable docToOrdCount, final Iterable ords) throws IOException { + throw new UnsupportedOperationException(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java index a7c8c1a2aa7..a4571ec6237 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42NormsFormat.java @@ -41,7 +41,7 @@ import org.apache.lucene.util.packed.PackedInts; * * @see Lucene42DocValuesFormat */ -public final class Lucene42NormsFormat extends NormsFormat { +public class Lucene42NormsFormat extends NormsFormat { final float acceptableOverheadRatio; /** @@ -67,7 +67,7 @@ public final class Lucene42NormsFormat extends NormsFormat { @Override public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException { - return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, acceptableOverheadRatio); + return new Lucene42NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, acceptableOverheadRatio); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java index d86002eb0af..e8885927644 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java @@ -24,7 +24,7 @@ import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase; * Tests Lucene42DocValuesFormat */ public class TestLucene42DocValuesFormat extends BaseCompressingDocValuesFormatTestCase { - private final Codec codec = new Lucene42Codec(); + private final Codec codec = new Lucene42RWCodec(); @Override protected Codec getCodec() { diff --git a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index ca851727ecf..8d9529a35bc 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -74,7 +74,7 @@ import org.junit.Ignore; // we won't even be running the actual code, only the impostor // @SuppressCodecs("Lucene4x") // Sep codec cannot yet handle the offsets in our 4.x index! 
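// (Editorial note, not part of the patch.) "Lucene42" joins the suppressed codecs because
// this commit makes the 4.2 formats read-only in core (fieldsConsumer now throws
// UnsupportedOperationException); tests that must still write 4.2 segments use the
// test-framework impersonators added below. Sketch of the RW override, with the body
// assumed from the moved consumer (constants as declared in Lucene42DocValuesFormat):
//   public class Lucene42RWDocValuesFormat extends Lucene42DocValuesFormat {
//     @Override
//     public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
//       return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION,
//                                            METADATA_CODEC, METADATA_EXTENSION,
//                                            PackedInts.DEFAULT); // overhead ratio: assumed
//     }
//   }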
-@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom", "Lucene40", "Lucene41"}) +@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom", "Lucene40", "Lucene41", "Lucene42"}) public class TestBackwardsCompatibility extends LuceneTestCase { // Uncomment these cases & run them on an older Lucene version, diff --git a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42Codec.java b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/Facet45Codec.java similarity index 85% rename from lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42Codec.java rename to lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/Facet45Codec.java index f8984770a41..9ff508429dc 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/Facet42Codec.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/Facet45Codec.java @@ -1,4 +1,4 @@ -package org.apache.lucene.facet.codecs.facet42; +package org.apache.lucene.facet.codecs.facet45; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,12 +21,13 @@ import java.util.HashSet; import java.util.Set; import org.apache.lucene.codecs.DocValuesFormat; -import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene45.Lucene45Codec; +import org.apache.lucene.facet.codecs.facet42.Facet42DocValuesFormat; import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.params.FacetIndexingParams; /** - * Same as {@link Lucene42Codec} except it uses {@link Facet42DocValuesFormat} + * Same as {@link Lucene45Codec} except it uses {@link Facet42DocValuesFormat} * for facet fields (faster-but-more-RAM-consuming doc values). * *

    @@ -42,16 +43,14 @@ import org.apache.lucene.facet.params.FacetIndexingParams; * * @lucene.experimental */ -// nocommit -public class Facet42Codec extends Lucene42Codec { +public class Facet45Codec extends Lucene45Codec { private final Set facetFields; private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName("Facet42"); - private final DocValuesFormat lucene42DVFormat = DocValuesFormat.forName("Lucene42"); // must have that for SPI purposes /** Default constructor, uses {@link FacetIndexingParams#DEFAULT}. */ - public Facet42Codec() { + public Facet45Codec() { this(FacetIndexingParams.DEFAULT); } @@ -60,7 +59,7 @@ public class Facet42Codec extends Lucene42Codec { * {@link DocValuesFormat} for the fields that are returned by * {@link FacetIndexingParams#getAllCategoryListParams()}. */ - public Facet42Codec(FacetIndexingParams fip) { + public Facet45Codec(FacetIndexingParams fip) { if (fip.getPartitionSize() != Integer.MAX_VALUE) { throw new IllegalArgumentException("this Codec does not support partitions"); } @@ -75,8 +74,7 @@ public class Facet42Codec extends Lucene42Codec { if (facetFields.contains(field)) { return facetsDVFormat; } else { - return lucene42DVFormat; + return super.getDocValuesFormatForField(field); } } - } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/package.html b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/package.html new file mode 100644 index 00000000000..c752b963484 --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet45/package.html @@ -0,0 +1,22 @@ + + + + +Codec + DocValuesFormat that are optimized for facets. + + diff --git a/lucene/facet/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/facet/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 343b4cdb07e..d6e8c740686 100644 --- a/lucene/facet/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/facet/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
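# (Editorial note, not part of the patch.) Codecs are discovered by name through this SPI
# file: a segment records the name of the codec that wrote it, and opening the segment
# resolves that name via Codec.forName, which only works once the class is registered here.
# Write-side usage, mirroring TestDemoFacets below:
#   IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
#   iwc.setCodec(new Facet45Codec());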
-org.apache.lucene.facet.codecs.facet42.Facet42Codec +org.apache.lucene.facet.codecs.facet45.Facet45Codec diff --git a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java index 31e79edbf14..6bf9fe6e009 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java @@ -3,7 +3,7 @@ package org.apache.lucene.facet; import java.util.Random; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.facet.codecs.facet42.Facet42Codec; +import org.apache.lucene.facet.codecs.facet45.Facet45Codec; import org.apache.lucene.facet.encoding.DGapIntEncoder; import org.apache.lucene.facet.encoding.DGapVInt8IntEncoder; import org.apache.lucene.facet.encoding.EightFlagsIntEncoder; @@ -53,7 +53,7 @@ public abstract class FacetTestCase extends LuceneTestCase { public static void beforeClassFacetTestCase() throws Exception { if (random().nextDouble() < 0.3) { savedDefault = Codec.getDefault(); // save to restore later - Codec.setDefault(new Facet42Codec()); + Codec.setDefault(new Facet45Codec()); } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java index 0cf73fcdb23..d0e65772160 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java @@ -31,7 +31,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.facet.FacetTestCase; import org.apache.lucene.facet.FacetTestUtils; -import org.apache.lucene.facet.codecs.facet42.Facet42Codec; +import org.apache.lucene.facet.codecs.facet45.Facet45Codec; import org.apache.lucene.facet.index.FacetFields; import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.params.FacetIndexingParams; @@ -260,7 +260,7 @@ public class TestDemoFacets extends FacetTestCase { Directory dir = newDirectory(); Directory taxoDir = newDirectory(); IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); - iwc.setCodec(new Facet42Codec()); + iwc.setCodec(new Facet45Codec()); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java index 8b2ed06e9e3..f973648aaeb 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastCompressingCodec.java @@ -1,9 +1,7 @@ package org.apache.lucene.codecs.compressing; -import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.NormsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; // nocommit -import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; // nocommit +import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; import org.apache.lucene.util.packed.PackedInts; /* @@ -42,9 +40,4 @@ public class FastCompressingCodec extends CompressingCodec { public NormsFormat normsFormat() { return new Lucene42NormsFormat(PackedInts.FAST); } - - @Override - public 
DocValuesFormat docValuesFormat() { - return new Lucene42DocValuesFormat(PackedInts.FAST); - } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java index 5b6385e0d36..d7c0451bcba 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/FastDecompressionCompressingCodec.java @@ -1,12 +1,9 @@ package org.apache.lucene.codecs.compressing; -import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.NormsFormat; -import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; import org.apache.lucene.util.packed.PackedInts; -// nocommit /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -43,9 +40,4 @@ public class FastDecompressionCompressingCodec extends CompressingCodec { public NormsFormat normsFormat() { return new Lucene42NormsFormat(PackedInts.DEFAULT); } - - @Override - public DocValuesFormat docValuesFormat() { - return new Lucene42DocValuesFormat(PackedInts.DEFAULT); - } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java index 6b4b1091b97..2f1fc293592 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/compressing/HighCompressionCompressingCodec.java @@ -21,7 +21,6 @@ import org.apache.lucene.util.packed.PackedInts; * limitations under the License. 
*/ -// nocommit /** CompressionCodec that uses {@link CompressionMode#HIGH_COMPRESSION} */ public class HighCompressionCompressingCodec extends CompressingCodec { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java similarity index 94% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java rename to lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java index 0a2f92f22a8..ac8aeafd33c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java @@ -46,25 +46,20 @@ import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; import org.apache.lucene.util.packed.PackedInts.FormatAndBits; import org.apache.lucene.util.packed.PackedInts; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.BYTES; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.NUMBER; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.FST; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.DELTA_COMPRESSED; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.GCD_COMPRESSED; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.TABLE_COMPRESSED; +import static org.apache.lucene.codecs.lucene42.Lucene42DocValuesProducer.UNCOMPRESSED; + /** * Writer for {@link Lucene42DocValuesFormat} */ class Lucene42DocValuesConsumer extends DocValuesConsumer { - static final int VERSION_START = 0; - static final int VERSION_GCD_COMPRESSION = 1; - static final int VERSION_CURRENT = VERSION_GCD_COMPRESSION; - - static final byte NUMBER = 0; - static final byte BYTES = 1; - static final byte FST = 2; - - static final int BLOCK_SIZE = 4096; - - static final byte DELTA_COMPRESSED = 0; - static final byte TABLE_COMPRESSED = 1; - static final byte UNCOMPRESSED = 2; - static final byte GCD_COMPRESSED = 3; - final IndexOutput data, meta; final int maxDoc; final float acceptableOverheadRatio; diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java new file mode 100644 index 00000000000..5ef50ea522f --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWCodec.java @@ -0,0 +1,39 @@ +package org.apache.lucene.codecs.lucene42; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.NormsFormat; + +/** + * Read-write version of {@link Lucene42Codec} for testing. + */ +public class Lucene42RWCodec extends Lucene42Codec { + private static final DocValuesFormat dv = new Lucene42RWDocValuesFormat(); + private static final NormsFormat norms = new Lucene42NormsFormat(); + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return dv; + } + + @Override + public NormsFormat normsFormat() { + return norms; + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWDocValuesFormat.java new file mode 100644 index 00000000000..45372246184 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/Lucene42RWDocValuesFormat.java @@ -0,0 +1,35 @@ +package org.apache.lucene.codecs.lucene42; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.index.SegmentWriteState; + +/** + * Read-write version of {@link Lucene42DocValuesFormat} for testing. + */ +public class Lucene42RWDocValuesFormat extends Lucene42DocValuesFormat { + + @Override + public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + // note: we choose DEFAULT here (it's reasonably fast, and for small bpv has tiny waste) + return new Lucene42DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, acceptableOverheadRatio); + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/package.html b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/package.html new file mode 100644 index 00000000000..f1c62d1e049 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene42/package.html @@ -0,0 +1,25 @@ +Support for testing {@link org.apache.lucene.codecs.lucene42.Lucene42Codec}.
\ No newline at end of file diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index 008eeea7d32..c6a0bf393cc 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -2498,6 +2498,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { d.close(); } + // nocommit: get this out of here and into the deprecated codecs (4.0, 4.2) public void testHugeBinaryValueLimit() throws Exception { // We only test DVFormats that have a limit assumeFalse("test requires codec with limits on max binary field length", codecAcceptsHugeBinaryValues("field")); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java index 1ab8c818b7f..54159212fab 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java @@ -40,6 +40,7 @@ import org.apache.lucene.codecs.lucene40.Lucene40RWCodec; import org.apache.lucene.codecs.lucene40.Lucene40RWPostingsFormat; import org.apache.lucene.codecs.lucene41.Lucene41RWCodec; import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.codecs.lucene42.Lucene42RWCodec; import org.apache.lucene.codecs.lucene45.Lucene45Codec; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.index.RandomCodec; @@ -147,7 +148,6 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { savedCodec = Codec.getDefault(); int randomVal = random.nextInt(10); - // nocommit: 4.2 impersonator if ("Lucene40".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && "random".equals(TEST_POSTINGSFORMAT) && "random".equals(TEST_DOCVALUESFORMAT) && @@ -163,6 +163,13 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { !shouldAvoidCodec("Lucene41"))) { codec = Codec.forName("Lucene41"); assert codec instanceof Lucene41RWCodec : "fix your classpath to have tests-framework.jar before lucene-core.jar"; + } else if ("Lucene42".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && + "random".equals(TEST_POSTINGSFORMAT) && + "random".equals(TEST_DOCVALUESFORMAT) && + randomVal == 2 && + !shouldAvoidCodec("Lucene42"))) { + codec = Codec.forName("Lucene42"); + assert codec instanceof Lucene42RWCodec : "fix your classpath to have tests-framework.jar before lucene-core.jar"; } else if (("random".equals(TEST_POSTINGSFORMAT) == false) || ("random".equals(TEST_DOCVALUESFORMAT) == false)) { // the user wired postings or DV: this is messy // refactor into RandomCodec....
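With the Lucene42 writer relocated into the test framework, back-compat tests can still produce 4.2-format indexes even though the shipping codec is now read-only. A minimal sketch of forcing the impersonator explicitly instead of relying on the random rotation above (hypothetical test body, using the usual test-framework helpers):

  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
  iwc.setCodec(new Lucene42RWCodec());  // writes the 4.2 format, which cannot represent missing values
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  doc.add(new NumericDocValuesField("dv", 5L));
  writer.addDocument(doc);
  writer.close();
  dir.close();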
diff --git a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index 9bd5ca87fd7..554bb0fbe07 100644 --- a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -21,3 +21,4 @@ org.apache.lucene.codecs.compressing.HighCompressionCompressingCodec org.apache.lucene.codecs.compressing.dummy.DummyCompressingCodec org.apache.lucene.codecs.lucene40.Lucene40RWCodec org.apache.lucene.codecs.lucene41.Lucene41RWCodec +org.apache.lucene.codecs.lucene42.Lucene42RWCodec diff --git a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat index d1798334486..2086be1f0e7 100644 --- a/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat +++ b/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat @@ -14,3 +14,4 @@ # limitations under the License. org.apache.lucene.codecs.asserting.AssertingDocValuesFormat +org.apache.lucene.codecs.lucene42.Lucene42RWDocValuesFormat From 98522b2262eccebc56daed45cc3245461962e1c2 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 19 Aug 2013 16:28:03 +0000 Subject: [PATCH 05/16] support missing for 4.5 and disk dv git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515496 13f79535-47bb-0310-9956-ffa450edef68 --- .../codecs/diskdv/DiskDocValuesProducer.java | 413 +----------------- .../simpletext/SimpleTextDocValuesReader.java | 8 +- .../lucene45/Lucene45DocValuesConsumer.java | 83 ++-- .../lucene45/Lucene45DocValuesProducer.java | 127 ++++-- .../apache/lucene/index/SortedDocValues.java | 2 +- .../apache/lucene/search/TestFieldCache.java | 6 +- .../lucene/index/AssertingAtomicReader.java | 4 +- .../index/BaseDocValuesFormatTestCase.java | 5 +- .../apache/lucene/util/LuceneTestCase.java | 7 +- 9 files changed, 167 insertions(+), 488 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java index 41d2e87b9fe..4972706ed67 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java @@ -17,427 +17,34 @@ package org.apache.lucene.codecs.diskdv; * limitations under the License. 
*/ -import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.DELTA_COMPRESSED; -import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.GCD_COMPRESSED; -import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.TABLE_COMPRESSED; - import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer; -import org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat; -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.codecs.lucene45.Lucene45DocValuesProducer; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SortedDocValues; -import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.packed.BlockPackedReader; import org.apache.lucene.util.packed.MonotonicBlockPackedReader; -import org.apache.lucene.util.packed.PackedInts; -class DiskDocValuesProducer extends DocValuesProducer { - private final Map numerics; - private final Map ords; - private final Map ordIndexes; - private final Map binaries; - private final IndexInput data; - private final int maxDoc; - +class DiskDocValuesProducer extends Lucene45DocValuesProducer { + DiskDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { - String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); - this.maxDoc = state.segmentInfo.getDocCount(); - // read in the entries from the metadata file. 
- IndexInput in = state.directory.openInput(metaName, state.context); - boolean success = false; - final int version; - try { - version = CodecUtil.checkHeader(in, metaCodec, - Lucene45DocValuesFormat.VERSION_CURRENT, - Lucene45DocValuesFormat.VERSION_CURRENT); - numerics = new HashMap(); - ords = new HashMap(); - ordIndexes = new HashMap(); - binaries = new HashMap(); - readFields(in); - - success = true; - } finally { - if (success) { - IOUtils.close(in); - } else { - IOUtils.closeWhileHandlingException(in); - } - } - - success = false; - try { - String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); - data = state.directory.openInput(dataName, state.context); - final int version2 = CodecUtil.checkHeader(data, dataCodec, - Lucene45DocValuesFormat.VERSION_CURRENT, - Lucene45DocValuesFormat.VERSION_CURRENT); - if (version != version2) { - throw new CorruptIndexException("Versions mismatch"); - } - - success = true; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(this.data); - } - } - - } - - private void readFields(IndexInput meta) throws IOException { - int fieldNumber = meta.readVInt(); - while (fieldNumber != -1) { - byte type = meta.readByte(); - if (type == Lucene45DocValuesFormat.NUMERIC) { - numerics.put(fieldNumber, readNumericEntry(meta)); - } else if (type == Lucene45DocValuesFormat.BINARY) { - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - } else if (type == Lucene45DocValuesFormat.SORTED) { - // sorted = binary + numeric - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != Lucene45DocValuesFormat.BINARY) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt"); - } - NumericEntry n = readNumericEntry(meta); - ords.put(fieldNumber, n); - } else if (type == Lucene45DocValuesFormat.SORTED_SET) { - // sortedset = binary + numeric + ordIndex - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != Lucene45DocValuesFormat.BINARY) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - BinaryEntry b = readBinaryEntry(meta); - binaries.put(fieldNumber, b); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - NumericEntry n1 = readNumericEntry(meta); - ords.put(fieldNumber, n1); - - if (meta.readVInt() != fieldNumber) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { - throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt"); - } - NumericEntry n2 = readNumericEntry(meta); - ordIndexes.put(fieldNumber, n2); - } else { - throw new 
CorruptIndexException("invalid type: " + type + ", resource=" + meta); - } - fieldNumber = meta.readVInt(); - } - } - - static NumericEntry readNumericEntry(IndexInput meta) throws IOException { - NumericEntry entry = new NumericEntry(); - entry.format = meta.readVInt(); - entry.packedIntsVersion = meta.readVInt(); - entry.offset = meta.readLong(); - entry.count = meta.readVLong(); - entry.blockSize = meta.readVInt(); - switch(entry.format) { - case GCD_COMPRESSED: - entry.minValue = meta.readLong(); - entry.gcd = meta.readLong(); - break; - case TABLE_COMPRESSED: - if (entry.count > Integer.MAX_VALUE) { - throw new CorruptIndexException("Cannot use TABLE_COMPRESSED with more than MAX_VALUE values, input=" + meta); - } - final int uniqueValues = meta.readVInt(); - if (uniqueValues > 256) { - throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + meta); - } - entry.table = new long[uniqueValues]; - for (int i = 0; i < uniqueValues; ++i) { - entry.table[i] = meta.readLong(); - } - break; - case DELTA_COMPRESSED: - break; - default: - throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); - } - return entry; - } - - static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException { - BinaryEntry entry = new BinaryEntry(); - int format = meta.readVInt(); - if (format != Lucene45DocValuesConsumer.BINARY_FIXED_UNCOMPRESSED && format != Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED) { - throw new CorruptIndexException("Unexpected format for binary entry: " + format + ", input=" + meta); - } - entry.minLength = meta.readVInt(); - entry.maxLength = meta.readVInt(); - entry.count = meta.readVLong(); - entry.offset = meta.readLong(); - if (entry.minLength != entry.maxLength) { - entry.addressesOffset = meta.readLong(); - entry.packedIntsVersion = meta.readVInt(); - entry.blockSize = meta.readVInt(); - } - return entry; + super(state, dataCodec, dataExtension, metaCodec, metaExtension); } @Override - public NumericDocValues getNumeric(FieldInfo field) throws IOException { - NumericEntry entry = numerics.get(field.number); - return getNumeric(field, entry); - } - - private LongNumericDocValues getNumeric(FieldInfo field, final NumericEntry entry) throws IOException { - final IndexInput data = this.data.clone(); - data.seek(entry.offset); - - switch (entry.format) { - case DELTA_COMPRESSED: - final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - return new LongNumericDocValues() { - @Override - public long get(long id) { - return reader.get(id); - } - }; - case GCD_COMPRESSED: - final long min = entry.minValue; - final long mult = entry.gcd; - final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - return new LongNumericDocValues() { - @Override - public long get(long id) { - return min + mult * quotientReader.get(id); - } - }; - case TABLE_COMPRESSED: - final long[] table = entry.table; - final int bitsRequired = PackedInts.bitsRequired(table.length - 1); - final PackedInts.Reader ords = PackedInts.getDirectReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, (int) entry.count, bitsRequired); - return new LongNumericDocValues() { - @Override - long get(long id) { - return table[(int) ords.get((int) id)]; - } - }; - default: - throw new AssertionError(); - } - } - - @Override - public BinaryDocValues getBinary(FieldInfo field) throws IOException 
{ - BinaryEntry bytes = binaries.get(field.number); - if (bytes.minLength == bytes.maxLength) { - return getFixedBinary(field, bytes); - } else { - return getVariableBinary(field, bytes); - } - } - - private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) { - final IndexInput data = this.data.clone(); - - return new LongBinaryDocValues() { - @Override - public void get(long id, BytesRef result) { - long address = bytes.offset + id * bytes.maxLength; - try { - data.seek(address); - // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) - // assume "they" own the bytes after calling this! - final byte[] buffer = new byte[bytes.maxLength]; - data.readBytes(buffer, 0, buffer.length); - result.bytes = buffer; - result.offset = 0; - result.length = buffer.length; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; - } - - private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final IndexInput data = this.data.clone(); + protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException { data.seek(bytes.addressesOffset); - - final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, true); - return new LongBinaryDocValues() { - @Override - public void get(long id, BytesRef result) { - long startAddress = bytes.offset + (id == 0 ? 0 : + addresses.get(id-1)); - long endAddress = bytes.offset + addresses.get(id); - int length = (int) (endAddress - startAddress); - try { - data.seek(startAddress); - // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) - // assume "they" own the bytes after calling this! 
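// Aside (sketch, not part of the patch): the ownership comment above is why each
// lookup allocates a fresh byte[]. Callers cache the returned bytes across calls,
// so with a shared buffer (assuming some BinaryDocValues "values"):
BytesRef first = new BytesRef();
BytesRef second = new BytesRef();
values.get(0, first);   // first.bytes points at a freshly allocated array
values.get(1, second);  // a shared buffer would rewrite first's contents here,
                        // silently corrupting anything that cached it (e.g. comparators)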
- final byte[] buffer = new byte[length]; - data.readBytes(buffer, 0, buffer.length); - result.bytes = buffer; - result.offset = 0; - result.length = length; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - }; + return new MonotonicBlockPackedReader(data.clone(), bytes.packedIntsVersion, bytes.blockSize, bytes.count, true); } @Override - public SortedDocValues getSorted(FieldInfo field) throws IOException { - final int valueCount = (int) binaries.get(field.number).count; - final BinaryDocValues binary = getBinary(field); - final NumericDocValues ordinals = getNumeric(field, ords.get(field.number)); - return new SortedDocValues() { - - @Override - public int getOrd(int docID) { - return (int) ordinals.get(docID); - } - - @Override - public void lookupOrd(int ord, BytesRef result) { - binary.get(ord, result); - } - - @Override - public int getValueCount() { - return valueCount; - } - }; + protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException { + throw new AssertionError(); } @Override - public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - final long valueCount = binaries.get(field.number).count; - final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); - final LongNumericDocValues ordinals = getNumeric(field, ords.get(field.number)); - NumericEntry entry = ordIndexes.get(field.number); - IndexInput data = this.data.clone(); + protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException { data.seek(entry.offset); - final MonotonicBlockPackedReader ordIndex = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true); - - return new SortedSetDocValues() { - long offset; - long endOffset; - - @Override - public long nextOrd() { - if (offset == endOffset) { - return NO_MORE_ORDS; - } else { - long ord = ordinals.get(offset); - offset++; - return ord; - } - } - - @Override - public void setDocument(int docID) { - offset = (docID == 0 ? 
0 : ordIndex.get(docID-1)); - endOffset = ordIndex.get(docID); - } - - @Override - public void lookupOrd(long ord, BytesRef result) { - binary.get(ord, result); - } - - @Override - public long getValueCount() { - return valueCount; - } - }; - } - - @Override - public Bits getDocsWithField(FieldInfo field) throws IOException { - if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) { - return new SortedSetDocsWithField(getSortedSet(field), maxDoc); - } else { - return new Bits.MatchAllBits(maxDoc); - } - } - - @Override - public void close() throws IOException { - data.close(); - } - - static class NumericEntry { - long offset; - - int format; - int packedIntsVersion; - long count; - int blockSize; - - long minValue; - long gcd; - long table[]; - } - - static class BinaryEntry { - long offset; - - long count; - int minLength; - int maxLength; - long addressesOffset; - int packedIntsVersion; - int blockSize; - } - - // internally we compose complex dv (sorted/sortedset) from other ones - static abstract class LongNumericDocValues extends NumericDocValues { - @Override - public final long get(int docID) { - return get((long) docID); - } - - abstract long get(long id); - } - - static abstract class LongBinaryDocValues extends BinaryDocValues { - @Override - public final void get(int docID, BytesRef result) { - get((long)docID, result); - } - - abstract void get(long id, BytesRef Result); + return new MonotonicBlockPackedReader(data.clone(), entry.packedIntsVersion, entry.blockSize, entry.count, true); } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java index c625f4ea2e8..9ead984bb84 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java @@ -321,12 +321,8 @@ class SimpleTextDocValuesReader extends DocValuesProducer { @Override public void lookupOrd(int ord, BytesRef result) { try { - if (ord == -1) { - result.length = 0; - return; - } - if (ord < -1 || ord >= field.numValues) { - throw new IndexOutOfBoundsException("ord must be -1 .. " + (field.numValues-1) + "; got " + ord); + if (ord < 0 || ord >= field.numValues) { + throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord); } in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength)); SimpleTextUtil.readLine(in, scratch); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java index 942ee228045..21ee03075f5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java @@ -23,7 +23,6 @@ import java.util.HashSet; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; -import org.apache.lucene.codecs.MissingOrdRemapper; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; @@ -88,14 +87,20 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer { long minValue = Long.MAX_VALUE; long maxValue = Long.MIN_VALUE; long gcd = 0; + boolean missing = false; // TODO: more efficient? 
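// Aside (worked example, not in the patch): the gcd computed over the values in the
// loop that follows enables GCD_COMPRESSED, which stores only (value - minValue) / gcd
// per document. For millisecond timestamps that are all whole seconds:
long gcd = 1000L;
long minValue = 1400000000000L;
long value = 1400000123000L;
long encoded = (value - minValue) / gcd;   // 123000 / 1000 = 123: far fewer bits to pack
long decoded = minValue + gcd * encoded;   // the read side reverses it exactly
assert decoded == value;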
HashSet uniqueValues = null; if (optimizeStorage) { uniqueValues = new HashSet<>(); - // nocommit: impl null values (ideally smartly) for (Number nv : values) { - final long v = nv == null ? 0 : nv.longValue(); + final long v; + if (nv == null) { + v = 0; + missing = true; + } else { + v = nv.longValue(); + } if (gcd != 1) { if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { @@ -142,6 +147,12 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer { meta.writeVInt(field.number); meta.writeByte(Lucene45DocValuesFormat.NUMERIC); meta.writeVInt(format); + if (missing) { + meta.writeLong(data.getFilePointer()); + writeMissingBitset(values); + } else { + meta.writeLong(-1L); + } meta.writeVInt(PackedInts.VERSION_CURRENT); meta.writeLong(data.getFilePointer()); meta.writeVLong(count); @@ -184,6 +195,27 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer { throw new AssertionError(); } } + + // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on, + // but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode) + void writeMissingBitset(Iterable values) throws IOException { + byte bits = 0; + int count = 0; + for (Object v : values) { + if (count == 8) { + data.writeByte(bits); + count = 0; + bits = 0; + } + if (v != null) { + bits |= 1 << (count & 7); + } + count++; + } + if (count > 0) { + data.writeByte(bits); + } + } @Override public void addBinaryField(FieldInfo field, Iterable values) throws IOException { @@ -194,8 +226,15 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer { int maxLength = Integer.MIN_VALUE; final long startFP = data.getFilePointer(); long count = 0; + boolean missing = false; for(BytesRef v : values) { - final int length = v == null ? 0 : v.length; + final int length; + if (v == null) { + length = 0; + missing = true; + } else { + length = v.length; + } minLength = Math.min(minLength, length); maxLength = Math.max(maxLength, length); if (v != null) { @@ -204,6 +243,12 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer { count++; } meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED); + if (missing) { + meta.writeLong(data.getFilePointer()); + writeMissingBitset(values); + } else { + meta.writeLong(-1L); + } meta.writeVInt(minLength); meta.writeVInt(maxLength); meta.writeVLong(count); @@ -244,6 +289,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer { meta.writeVInt(field.number); meta.writeByte(Lucene45DocValuesFormat.BINARY); meta.writeVInt(BINARY_PREFIX_COMPRESSED); + meta.writeLong(-1L); // now write the bytes: sharing prefixes within a block final long startFP = data.getFilePointer(); // currently, we have to store the delta from expected for every 1/nth term @@ -286,34 +332,6 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer { @Override public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { - // nocommit: remove this hack and support missing! - - // three cases for simulating the old writer: - // 1. no missing - // 2. missing (and empty string in use): remap ord=-1 -> ord=0 - // 3. 
missing (and empty string not in use): remap all ords +1, insert empty string into values - boolean anyMissing = false; - for (Number n : docToOrd) { - if (n.longValue() == -1) { - anyMissing = true; - break; - } - } - - boolean hasEmptyString = false; - for (BytesRef b : values) { - hasEmptyString = b.length == 0; - break; - } - - if (!anyMissing) { - // nothing to do - } else if (hasEmptyString) { - docToOrd = MissingOrdRemapper.mapMissingToOrd0(docToOrd); - } else { - docToOrd = MissingOrdRemapper.mapAllOrds(docToOrd); - values = MissingOrdRemapper.insertEmptyValue(values); - } meta.writeVInt(field.number); meta.writeByte(Lucene45DocValuesFormat.SORTED); addTermsDict(field, values); @@ -334,6 +352,7 @@ public class Lucene45DocValuesConsumer extends DocValuesConsumer { meta.writeVInt(field.number); meta.writeByte(Lucene45DocValuesFormat.NUMERIC); meta.writeVInt(DELTA_COMPRESSED); + meta.writeLong(-1L); meta.writeVInt(PackedInts.VERSION_CURRENT); meta.writeLong(data.getFilePointer()); meta.writeVLong(maxDoc); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java index b19a34e169c..b1ca3a8cf60 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java @@ -53,7 +53,7 @@ import org.apache.lucene.util.packed.BlockPackedReader; import org.apache.lucene.util.packed.MonotonicBlockPackedReader; import org.apache.lucene.util.packed.PackedInts; -class Lucene45DocValuesProducer extends DocValuesProducer { +public class Lucene45DocValuesProducer extends DocValuesProducer { private final Map numerics; private final Map binaries; private final Map ords; @@ -65,7 +65,7 @@ class Lucene45DocValuesProducer extends DocValuesProducer { private final Map addressInstances = new HashMap(); private final Map ordIndexInstances = new HashMap(); - Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + protected Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); // read in the entries from the metadata file. 
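// Aside (standalone sketch of the missing bitset written by writeMissingBitset above
// and consumed by getMissingBits further down; assumes the byte-aligned, LSB-first
// layout shown there, with missingOffset == -1 meaning "no document is missing"):
static byte[] writeBits(boolean[] hasValue) {
  byte[] bits = new byte[(hasValue.length + 7) >> 3];
  for (int i = 0; i < hasValue.length; i++) {
    if (hasValue[i]) {
      bits[i >> 3] |= 1 << (i & 7);   // doc i lives at byte (i >> 3), bit (i & 7)
    }
  }
  return bits;
}
static boolean hasValue(byte[] bits, int doc) {
  return (bits[doc >> 3] & (1 << (doc & 7))) != 0;  // mirrors getMissingBits
}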
IndexInput in = state.directory.openInput(metaName, state.context); @@ -176,6 +176,7 @@ class Lucene45DocValuesProducer extends DocValuesProducer { static NumericEntry readNumericEntry(IndexInput meta) throws IOException { NumericEntry entry = new NumericEntry(); entry.format = meta.readVInt(); + entry.missingOffset = meta.readLong(); entry.packedIntsVersion = meta.readVInt(); entry.offset = meta.readLong(); entry.count = meta.readVLong(); @@ -209,6 +210,7 @@ class Lucene45DocValuesProducer extends DocValuesProducer { static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException { BinaryEntry entry = new BinaryEntry(); entry.format = meta.readVInt(); + entry.missingOffset = meta.readLong(); entry.minLength = meta.readVInt(); entry.maxLength = meta.readVInt(); entry.count = meta.readVLong(); @@ -315,9 +317,7 @@ class Lucene45DocValuesProducer extends DocValuesProducer { }; } - private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final IndexInput data = this.data.clone(); - + protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException { final MonotonicBlockPackedReader addresses; synchronized (addressInstances) { MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number); @@ -328,6 +328,13 @@ class Lucene45DocValuesProducer extends DocValuesProducer { } addresses = addrInstance; } + return addresses; + } + + private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final IndexInput data = this.data.clone(); + + final MonotonicBlockPackedReader addresses = getAddressInstance(data, field, bytes); return new LongBinaryDocValues() { @Override @@ -350,12 +357,10 @@ class Lucene45DocValuesProducer extends DocValuesProducer { } }; } - - private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { - final IndexInput data = this.data.clone(); - final long interval = bytes.addressInterval; - + + protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException { final MonotonicBlockPackedReader addresses; + final long interval = bytes.addressInterval; synchronized (addressInstances) { MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number); if (addrInstance == null) { @@ -371,6 +376,14 @@ class Lucene45DocValuesProducer extends DocValuesProducer { } addresses = addrInstance; } + return addresses; + } + + + private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException { + final IndexInput data = this.data.clone(); + + final MonotonicBlockPackedReader addresses = getIntervalInstance(data, field, bytes); return new CompressedBinaryDocValues(bytes, addresses, data); } @@ -420,26 +433,30 @@ class Lucene45DocValuesProducer extends DocValuesProducer { } }; } - - @Override - public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - final long valueCount = binaries.get(field.number).count; - // we keep the byte[]s and list of ords on disk, these could be large - final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); - final LongNumericDocValues ordinals = getNumeric(ords.get(field.number)); - // but the addresses to the ord stream are in RAM + + protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException { final 
MonotonicBlockPackedReader ordIndex; synchronized (ordIndexInstances) { MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number); if (ordIndexInstance == null) { - NumericEntry entry = ordIndexes.get(field.number); - IndexInput data = this.data.clone(); data.seek(entry.offset); ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false); ordIndexInstances.put(field.number, ordIndexInstance); } ordIndex = ordIndexInstance; } + return ordIndex; + } + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + final IndexInput data = this.data.clone(); + final long valueCount = binaries.get(field.number).count; + // we keep the byte[]s and list of ords on disk, these could be large + final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); + final LongNumericDocValues ordinals = getNumeric(ords.get(field.number)); + // but the addresses to the ord stream are in RAM + final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(data, field, ordIndexes.get(field.number)); return new SortedSetDocValues() { long offset; @@ -491,15 +508,47 @@ class Lucene45DocValuesProducer extends DocValuesProducer { } }; } + + public Bits getMissingBits(final long offset) throws IOException { + if (offset == -1) { + return new Bits.MatchAllBits(maxDoc); + } else { + final IndexInput in = data.clone(); + return new Bits() { + + @Override + public boolean get(int index) { + try { + in.seek(offset + (index >> 3)); + return (in.readByte() & (1 << (index & 7))) != 0; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public int length() { + return maxDoc; + } + }; + } + } @Override public Bits getDocsWithField(FieldInfo field) throws IOException { - // nocommit: only use this if the field's entry has missing values (write that), - // otherwise return MatchAllBits - if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) { - return new SortedSetDocsWithField(getSortedSet(field), maxDoc); - } else { - return new Bits.MatchAllBits(maxDoc); + switch(field.getDocValuesType()) { + case SORTED_SET: + return new SortedSetDocsWithField(getSortedSet(field), maxDoc); + case SORTED: + return new SortedDocsWithField(getSorted(field), maxDoc); + case BINARY: + BinaryEntry be = binaries.get(field.number); + return getMissingBits(be.missingOffset); + case NUMERIC: + NumericEntry ne = numerics.get(field.number); + return getMissingBits(ne.missingOffset); + default: + throw new AssertionError(); } } @@ -508,30 +557,32 @@ class Lucene45DocValuesProducer extends DocValuesProducer { data.close(); } - static class NumericEntry { - long offset; + protected static class NumericEntry { + long missingOffset; + public long offset; - int format; - int packedIntsVersion; - long count; - int blockSize; + public int format; + public int packedIntsVersion; + public long count; + public int blockSize; long minValue; long gcd; long table[]; } - static class BinaryEntry { + protected static class BinaryEntry { + long missingOffset; long offset; int format; - long count; + public long count; int minLength; int maxLength; - long addressesOffset; - long addressInterval; - int packedIntsVersion; - int blockSize; + public long addressesOffset; + public long addressInterval; + public int packedIntsVersion; + public int blockSize; } // internally we compose complex dv (sorted/sortedset) from other ones diff --git 
a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java index 1968a791157..df36931a253 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java @@ -43,7 +43,7 @@ public abstract class SortedDocValues extends BinaryDocValues { public abstract int getOrd(int docID); /** Retrieves the value for the specified ordinal. - * @param ord ordinal to lookup + * @param ord ordinal to lookup (must be >= 0 and < {@link #getValueCount()}) * @param result will be populated with the ordinal's value * @see #getOrd(int) */ diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java b/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java index da1cf218617..c36367277bb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java @@ -480,7 +480,7 @@ public class TestFieldCache extends LuceneTestCase { } catch (IllegalStateException expected) {} Bits bits = FieldCache.DEFAULT.getDocsWithField(ar, "binary"); - assertTrue(bits instanceof Bits.MatchAllBits); + assertTrue(bits.get(0)); // Sorted type: can be retrieved via getTerms(), getTermsIndex(), getDocTermOrds() try { @@ -510,7 +510,7 @@ public class TestFieldCache extends LuceneTestCase { assertEquals(1, sortedSet.getValueCount()); bits = FieldCache.DEFAULT.getDocsWithField(ar, "sorted"); - assertTrue(bits instanceof Bits.MatchAllBits); + assertTrue(bits.get(0)); // Numeric type: can be retrieved via getInts() and so on Ints numeric = FieldCache.DEFAULT.getInts(ar, "numeric", false); @@ -537,7 +537,7 @@ public class TestFieldCache extends LuceneTestCase { } catch (IllegalStateException expected) {} bits = FieldCache.DEFAULT.getDocsWithField(ar, "numeric"); - assertTrue(bits instanceof Bits.MatchAllBits); + assertTrue(bits.get(0)); // SortedSet type: can be retrieved via getDocTermOrds() if (defaultCodecSupportsSortedSet()) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java index 6db5ccdeea3..eb6b20df5eb 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java @@ -438,14 +438,14 @@ public class AssertingAtomicReader extends FilterAtomicReader { this.in = in; this.maxDoc = maxDoc; this.valueCount = in.getValueCount(); - assert valueCount >= 1 && valueCount <= maxDoc; + assert valueCount >= 0 && valueCount <= maxDoc; } @Override public int getOrd(int docID) { assert docID >= 0 && docID < maxDoc; int ord = in.getOrd(docID); - assert ord >= 0 && ord < valueCount; + assert ord >= -1 && ord < valueCount; return ord; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index c6a0bf393cc..8622c4ad9e1 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -696,7 +696,10 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { BytesRef scratch = new BytesRef(); dv.lookupOrd(dv.getOrd(0), scratch); assertEquals(new BytesRef("hello 
world 2"), scratch); - dv.lookupOrd(dv.getOrd(1), scratch); + if (codecSupportsDocsWithField("dv")) { + assertEquals(-1, dv.getOrd(1)); + } + dv.get(1, scratch); assertEquals(new BytesRef(""), scratch); ireader.close(); directory.close(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java index f14f772e0df..27f736b8706 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java @@ -1372,8 +1372,11 @@ public abstract class LuceneTestCase extends Assert { /** Returns true if the codec for the field "supports" docsWithField * (other codecs return MatchAllBits, because you couldnt write missing values before) */ public static boolean codecSupportsDocsWithField(String field) { - // currently only one codec! - return _TestUtil.getDocValuesFormat(Codec.getDefault(), field).equals("SimpleText"); + String name = _TestUtil.getDocValuesFormat(Codec.getDefault(), field); + if (name.equals("Lucene40") || name.equals("Lucene42")) { + return false; + } + return true; } public void assertReaderEquals(String info, IndexReader leftReader, IndexReader rightReader) throws IOException { From d208878c116bb66d6cb114ce51f69b7264ab668a Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 19 Aug 2013 17:23:52 +0000 Subject: [PATCH 06/16] improve DV faceting tests, support missing count for single valued string fields, remove required/default restriction git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515520 13f79535-47bb-0310-9956-ffa450edef68 --- .../valuesource/BytesRefFieldSource.java | 3 +- .../index/BaseDocValuesFormatTestCase.java | 37 +++++++++++++++++++ .../apache/solr/request/DocValuesFacets.java | 11 ++---- .../apache/solr/request/NumericFacets.java | 2 +- .../java/org/apache/solr/schema/StrField.java | 3 -- .../org/apache/solr/schema/TrieField.java | 3 -- ...hema-docValues-not-required-no-default.xml | 33 ----------------- .../conf/schema-docValuesFaceting.xml | 9 ++--- .../org/apache/solr/TestRandomDVFaceting.java | 25 +++++++++---- .../solr/schema/BadIndexSchemaTest.java | 4 -- solr/example/solr/collection1/conf/schema.xml | 6 ++- 11 files changed, 69 insertions(+), 67 deletions(-) delete mode 100644 solr/core/src/test-files/solr/collection1/conf/bad-schema-docValues-not-required-no-default.xml diff --git a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/BytesRefFieldSource.java b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/BytesRefFieldSource.java index edbc37c8c76..871c94cc50e 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/BytesRefFieldSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/BytesRefFieldSource.java @@ -45,12 +45,13 @@ public class BytesRefFieldSource extends FieldCacheSource { // To be sorted or not to be sorted, that is the question // TODO: do it cleaner? 
if (fieldInfo != null && fieldInfo.getDocValuesType() == DocValuesType.BINARY) { + final Bits docsWithField = FieldCache.DEFAULT.getDocsWithField(readerContext.reader(), field); final BinaryDocValues binaryValues = FieldCache.DEFAULT.getTerms(readerContext.reader(), field); return new FunctionValues() { @Override public boolean exists(int doc) { - return true; // doc values are dense + return docsWithField.get(doc); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index 8622c4ad9e1..48e23848aea 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -650,6 +650,43 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { ireader.close(); directory.close(); } + + public void testSortedMergeAwayAllValues() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.NO)); + iwriter.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new SortedDocValuesField("field", new BytesRef("hello"))); + iwriter.addDocument(doc); + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + SortedDocValues dv = getOnlySegmentReader(ireader).getSortedDocValues("field"); + if (codecSupportsDocsWithField("field")) { + assertEquals(-1, dv.getOrd(0)); + assertEquals(0, dv.getValueCount()); + } else { + assertEquals(0, dv.getOrd(0)); + assertEquals(1, dv.getValueCount()); + BytesRef ref = new BytesRef(); + dv.lookupOrd(0, ref); + assertEquals(new BytesRef(), ref); + } + + ireader.close(); + directory.close(); + } public void testBytesWithNewline() throws IOException { Analyzer analyzer = new MockAnalyzer(random()); diff --git a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java index bca44a19d76..fb424fdee8b 100644 --- a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java +++ b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java @@ -218,12 +218,7 @@ public class DocValuesFacets { static NamedList finalize(NamedList res, SolrIndexSearcher searcher, SchemaField schemaField, DocSet docs, int missingCount, boolean missing) throws IOException { if (missing) { if (missingCount < 0) { - if (schemaField.multiValued()) { - missingCount = SimpleFacets.getFieldMissingCount(searcher,docs,schemaField.getName()); - } else { - // nocommit: support missing count (ord = -1) for single-valued here. 
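// Aside (sketch of the ord-based count the nocommit removed below asked about: now
// that single-valued sorted docvalues can return ord == -1, a missing count can be
// derived straight from the ords; hypothetical helper, not what the patch ships):
static int countMissing(SortedDocValues si, DocIdSetIterator disi) throws IOException {
  int missing = 0;
  int doc;
  while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    if (si.getOrd(doc) == -1) {  // -1 now means "document has no value"
      missing++;
    }
  }
  return missing;
}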
- missingCount = 0; // single-valued dv is implicitly 0 - } + missingCount = SimpleFacets.getFieldMissingCount(searcher,docs,schemaField.getName()); } res.add(null, missingCount); } @@ -232,12 +227,12 @@ } /** accumulates per-segment single-valued facet counts, mapping to global ordinal space */ - // specialized since the single-valued case is simpler: you don't have to deal with missing count, etc + // specialized since the single-valued case is different static void accumSingle(int counts[], int startTermIndex, SortedDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException { int doc; while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { int term = si.getOrd(doc); - if (map != null) { + if (map != null && term >= 0) { term = (int) map.getGlobalOrd(subIndex, term); } int arrIdx = term-startTermIndex; diff --git a/solr/core/src/java/org/apache/solr/request/NumericFacets.java b/solr/core/src/java/org/apache/solr/request/NumericFacets.java index e16e6358ca4..62950e2e7ca 100644 --- a/solr/core/src/java/org/apache/solr/request/NumericFacets.java +++ b/solr/core/src/java/org/apache/solr/request/NumericFacets.java @@ -255,7 +255,7 @@ final class NumericFacets { if (zeros && (limit < 0 || result.size() < limit)) { // need to merge with the term dict if (!sf.indexed()) { - throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on a field which is not indexed"); + throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on field " + sf.getName() + " which is not indexed"); } // Add zeros until there are limit results final Set alreadySeen = new HashSet(); diff --git a/solr/core/src/java/org/apache/solr/schema/StrField.java b/solr/core/src/java/org/apache/solr/schema/StrField.java index 7c59741d7d8..2c9600c67de 100644 --- a/solr/core/src/java/org/apache/solr/schema/StrField.java +++ b/solr/core/src/java/org/apache/solr/schema/StrField.java @@ -80,9 +80,6 @@ public class StrField extends PrimitiveFieldType { @Override public void checkSchemaField(SchemaField field) { - if (field.hasDocValues() && !field.multiValued() && !(field.isRequired() || field.getDefaultValue() != null)) { - throw new IllegalStateException("Field " + this + " has single-valued doc values enabled, but has no default value and is not required"); - } } } diff --git a/solr/core/src/java/org/apache/solr/schema/TrieField.java b/solr/core/src/java/org/apache/solr/schema/TrieField.java index 99cff2204b5..d0e92f7e427 100644 --- a/solr/core/src/java/org/apache/solr/schema/TrieField.java +++ b/solr/core/src/java/org/apache/solr/schema/TrieField.java @@ -696,9 +696,6 @@ public class TrieField extends PrimitiveFieldType { @Override public void checkSchemaField(final SchemaField field) { - if (field.hasDocValues() && !field.multiValued() && !(field.isRequired() || field.getDefaultValue() != null)) { - throw new IllegalStateException("Field " + this + " has single-valued doc values enabled, but has no default value and is not required"); - } } } diff --git a/solr/core/src/test-files/solr/collection1/conf/bad-schema-docValues-not-required-no-default.xml b/solr/core/src/test-files/solr/collection1/conf/bad-schema-docValues-not-required-no-default.xml deleted file mode 100644 index deadd9ac68b..00000000000 --- a/solr/core/src/test-files/solr/collection1/conf/bad-schema-docValues-not-required-no-default.xml +++ /dev/null @@ -1,33 +0,0 @@ diff --git
a/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml b/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml index e811f91e8a1..0e3116d0797 100755 --- a/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml +++ b/solr/core/src/test-files/solr/collection1/conf/schema-docValuesFaceting.xml @@ -26,17 +26,16 @@ diff --git a/solr/core/src/test/org/apache/solr/TestRandomDVFaceting.java b/solr/core/src/test/org/apache/solr/TestRandomDVFaceting.java index df36b6b593b..b6581af49bf 100644 --- a/solr/core/src/test/org/apache/solr/TestRandomDVFaceting.java +++ b/solr/core/src/test/org/apache/solr/TestRandomDVFaceting.java @@ -39,7 +39,7 @@ import org.junit.Test; * to the indexed facet results as if it were just another faceting method. */ @Slow -@SuppressCodecs({"Lucene40", "Lucene41"}) +@SuppressCodecs({"Lucene40", "Lucene41", "Lucene42"}) public class TestRandomDVFaceting extends SolrTestCaseJ4 { @BeforeClass @@ -162,6 +162,8 @@ public class TestRandomDVFaceting extends SolrTestCaseJ4 { SchemaField sf = req.getSchema().getField(ftype.fname); boolean multiValued = sf.getType().multiValuedFieldCache(); + boolean indexed = sf.indexed(); + boolean numeric = sf.getType().getNumericType() != null; int offset = 0; if (rand.nextInt(100) < 20) { @@ -179,8 +181,21 @@ public class TestRandomDVFaceting extends SolrTestCaseJ4 { params.add("facet.limit", Integer.toString(limit)); } - if (rand.nextBoolean()) { - params.add("facet.sort", rand.nextBoolean() ? "index" : "count"); + // the following two situations cannot work for unindexed single-valued numerics: + // (currently none of the dv fields in this test config) + // facet.sort = index + // facet.minCount = 0 + if (!numeric || sf.multiValued()) { + if (rand.nextBoolean()) { + params.add("facet.sort", rand.nextBoolean() ? 
"index" : "count"); + } + + if (rand.nextInt(100) < 10) { + params.add("facet.mincount", Integer.toString(rand.nextInt(5))); + } + } else { + params.add("facet.sort", "count"); + params.add("facet.mincount", Integer.toString(1+rand.nextInt(5))); } if ((ftype.vals instanceof SVal) && rand.nextInt(100) < 20) { @@ -192,10 +207,6 @@ public class TestRandomDVFaceting extends SolrTestCaseJ4 { params.add("facet.prefix", prefix); } - if (rand.nextInt(100) < 10) { - params.add("facet.mincount", Integer.toString(rand.nextInt(5))); - } - if (rand.nextInt(100) < 20) { params.add("facet.missing", "true"); } diff --git a/solr/core/src/test/org/apache/solr/schema/BadIndexSchemaTest.java b/solr/core/src/test/org/apache/solr/schema/BadIndexSchemaTest.java index 1a2693e56fd..a4453ddfbc3 100644 --- a/solr/core/src/test/org/apache/solr/schema/BadIndexSchemaTest.java +++ b/solr/core/src/test/org/apache/solr/schema/BadIndexSchemaTest.java @@ -93,10 +93,6 @@ public class BadIndexSchemaTest extends AbstractBadConfigTestBase { doTest("bad-schema-codec-global-vs-ft-mismatch.xml", "codec does not support"); } - public void testDocValuesNotRequiredNoDefault() throws Exception { - doTest("bad-schema-docValues-not-required-no-default.xml", "has no default value and is not required"); - } - public void testDocValuesUnsupported() throws Exception { doTest("bad-schema-unsupported-docValues.xml", "does not support doc values"); } diff --git a/solr/example/solr/collection1/conf/schema.xml b/solr/example/solr/collection1/conf/schema.xml index 9cdd2976026..75fad489f25 100755 --- a/solr/example/solr/collection1/conf/schema.xml +++ b/solr/example/solr/collection1/conf/schema.xml @@ -168,8 +168,10 @@ + + + + + +Codecs for testing (simulate old disk formats, wacky theoretical use cases, etc) + + diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index 48e23848aea..c1902cd93eb 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -2538,7 +2538,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { d.close(); } - // nocommit: get this out of here and into the deprecated codecs (4.0, 4.2) + // TODO: get this out of here and into the deprecated codecs (4.0, 4.2) public void testHugeBinaryValueLimit() throws Exception { // We only test DVFormats that have a limit assumeFalse("test requires codec with limits on max binary field length", codecAcceptsHugeBinaryValues("field")); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java index d79c948ed7e..b1a43a0fa59 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java @@ -757,14 +757,13 @@ public class _TestUtil { } } - // nocommit: remove this, push this test to Lucene40/Lucene42 codec tests + // TODO: remove this, push this test to Lucene40/Lucene42 codec tests public static boolean fieldSupportsHugeBinaryDocValues(String field) { String dvFormat = getDocValuesFormat(field); - System.out.println(dvFormat); - return dvFormat.equals("Lucene45") || - dvFormat.equals("Asserting") || - dvFormat.equals("Disk") || - dvFormat.equals("SimpleText"); + if (dvFormat.equals("Lucene40") || 
dvFormat.equals("Lucene42")) { + return false; + } + return true; } public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException { From 28316a161c90216b50d63fcd3ba8084c99b895eb Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 19 Aug 2013 20:11:03 +0000 Subject: [PATCH 10/16] add memorydv with missing support git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515582 13f79535-47bb-0310-9956-ffa450edef68 --- .../memory/MemoryDocValuesConsumer.java | 403 +++++++++++ .../codecs/memory/MemoryDocValuesFormat.java | 72 ++ .../memory/MemoryDocValuesProducer.java | 633 ++++++++++++++++++ .../apache/lucene/codecs/memory/package.html | 2 +- .../org.apache.lucene.codecs.DocValuesFormat | 1 + .../memory/TestMemoryDocValuesFormat.java | 39 ++ .../org/apache/lucene/index/RandomCodec.java | 2 + .../org/apache/lucene/util/_TestUtil.java | 2 +- 8 files changed, 1152 insertions(+), 2 deletions(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesFormat.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java create mode 100644 lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestMemoryDocValuesFormat.java diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java new file mode 100644 index 00000000000..2b3b9901fc5 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesConsumer.java @@ -0,0 +1,403 @@ +package org.apache.lucene.codecs.memory; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.NoSuchElementException; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.MathUtil; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FST.INPUT_TYPE; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.BlockPackedWriter; +import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; +import org.apache.lucene.util.packed.PackedInts.FormatAndBits; +import org.apache.lucene.util.packed.PackedInts; + +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.VERSION_CURRENT; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BLOCK_SIZE; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.BYTES; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.NUMBER; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.FST; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.DELTA_COMPRESSED; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.GCD_COMPRESSED; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.TABLE_COMPRESSED; +import static org.apache.lucene.codecs.memory.MemoryDocValuesProducer.UNCOMPRESSED; + +/** + * Writer for {@link MemoryDocValuesFormat} + */ +class MemoryDocValuesConsumer extends DocValuesConsumer { + final IndexOutput data, meta; + final int maxDoc; + final float acceptableOverheadRatio; + + MemoryDocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension, float acceptableOverheadRatio) throws IOException { + this.acceptableOverheadRatio = acceptableOverheadRatio; + maxDoc = state.segmentInfo.getDocCount(); + boolean success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.createOutput(dataName, state.context); + CodecUtil.writeHeader(data, dataCodec, VERSION_CURRENT); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + meta = state.directory.createOutput(metaName, state.context); + CodecUtil.writeHeader(meta, metaCodec, VERSION_CURRENT); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @Override + public void addNumericField(FieldInfo field, Iterable values) throws IOException { + addNumericField(field, values, true); + } + + void addNumericField(FieldInfo field, Iterable values, boolean optimizeStorage) throws IOException { + meta.writeVInt(field.number); + meta.writeByte(NUMBER); + meta.writeLong(data.getFilePointer()); + long minValue = Long.MAX_VALUE; + long maxValue = Long.MIN_VALUE; + long gcd = 0; + boolean missing = false; + // TODO: more efficient? 
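+ // (uniqueValues tracks distinct values while scanning; once more than 256 are
+ // seen it is nulled out below and table compression is abandoned for this field)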
+ HashSet uniqueValues = null; + if (optimizeStorage) { + uniqueValues = new HashSet<>(); + + long count = 0; + for (Number nv : values) { + final long v; + if (nv == null) { + v = 0; + missing = true; + } else { + v = nv.longValue(); + } + + if (gcd != 1) { + if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) { + // in that case v - minValue might overflow and make the GCD computation return + // wrong results. Since these extreme values are unlikely, we just discard + // GCD computation for them + gcd = 1; + } else if (count != 0) { // minValue needs to be set first + gcd = MathUtil.gcd(gcd, v - minValue); + } + } + + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + + if (uniqueValues != null) { + if (uniqueValues.add(v)) { + if (uniqueValues.size() > 256) { + uniqueValues = null; + } + } + } + + ++count; + } + assert count == maxDoc; + } + + if (missing) { + long start = data.getFilePointer(); + writeMissingBitset(values); + meta.writeLong(start); + meta.writeLong(data.getFilePointer() - start); + } else { + meta.writeLong(-1L); + } + + if (uniqueValues != null) { + // small number of unique values + final int bitsPerValue = PackedInts.bitsRequired(uniqueValues.size()-1); + FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio); + if (formatAndBits.bitsPerValue == 8 && minValue >= Byte.MIN_VALUE && maxValue <= Byte.MAX_VALUE) { + meta.writeByte(UNCOMPRESSED); // uncompressed + for (Number nv : values) { + data.writeByte(nv == null ? 0 : (byte) nv.longValue()); + } + } else { + meta.writeByte(TABLE_COMPRESSED); // table-compressed + Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]); + final HashMap encode = new HashMap(); + data.writeVInt(decode.length); + for (int i = 0; i < decode.length; i++) { + data.writeLong(decode[i]); + encode.put(decode[i], i); + } + + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeVInt(formatAndBits.format.getId()); + data.writeVInt(formatAndBits.bitsPerValue); + + final PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE); + for(Number nv : values) { + writer.add(encode.get(nv == null ? 0 : nv.longValue())); + } + writer.finish(); + } + } else if (gcd != 0 && gcd != 1) { + meta.writeByte(GCD_COMPRESSED); + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeLong(minValue); + data.writeLong(gcd); + data.writeVInt(BLOCK_SIZE); + + final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); + for (Number nv : values) { + long value = nv == null ? 0 : nv.longValue(); + writer.add((value - minValue) / gcd); + } + writer.finish(); + } else { + meta.writeByte(DELTA_COMPRESSED); // delta-compressed + + meta.writeVInt(PackedInts.VERSION_CURRENT); + data.writeVInt(BLOCK_SIZE); + + final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE); + for (Number nv : values) { + writer.add(nv == null ? 
0 : nv.longValue()); + } + writer.finish(); + } + } + + @Override + public void close() throws IOException { + boolean success = false; + try { + if (meta != null) { + meta.writeVInt(-1); // write EOF marker + } + success = true; + } finally { + if (success) { + IOUtils.close(data, meta); + } else { + IOUtils.closeWhileHandlingException(data, meta); + } + } + } + + @Override + public void addBinaryField(FieldInfo field, final Iterable values) throws IOException { + // write the byte[] data + meta.writeVInt(field.number); + meta.writeByte(BYTES); + int minLength = Integer.MAX_VALUE; + int maxLength = Integer.MIN_VALUE; + final long startFP = data.getFilePointer(); + boolean missing = false; + for(BytesRef v : values) { + final int length; + if (v == null) { + length = 0; + missing = true; + } else { + length = v.length; + } + if (length > MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH) { + throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH); + } + minLength = Math.min(minLength, length); + maxLength = Math.max(maxLength, length); + if (v != null) { + data.writeBytes(v.bytes, v.offset, v.length); + } + } + meta.writeLong(startFP); + meta.writeLong(data.getFilePointer() - startFP); + if (missing) { + long start = data.getFilePointer(); + writeMissingBitset(values); + meta.writeLong(start); + meta.writeLong(data.getFilePointer() - start); + } else { + meta.writeLong(-1L); + } + meta.writeVInt(minLength); + meta.writeVInt(maxLength); + + // if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit) + // otherwise, we need to record the length fields... + if (minLength != maxLength) { + meta.writeVInt(PackedInts.VERSION_CURRENT); + meta.writeVInt(BLOCK_SIZE); + + final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE); + long addr = 0; + for (BytesRef v : values) { + if (v != null) { + addr += v.length; + } + writer.add(addr); + } + writer.finish(); + } + } + + private void writeFST(FieldInfo field, Iterable values) throws IOException { + meta.writeVInt(field.number); + meta.writeByte(FST); + meta.writeLong(data.getFilePointer()); + PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); + Builder builder = new Builder(INPUT_TYPE.BYTE1, outputs); + IntsRef scratch = new IntsRef(); + long ord = 0; + for (BytesRef v : values) { + builder.add(Util.toIntsRef(v, scratch), ord); + ord++; + } + FST fst = builder.finish(); + if (fst != null) { + fst.save(data); + } + meta.writeVLong(ord); + } + + // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on, + // but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode) + void writeMissingBitset(Iterable values) throws IOException { + long bits = 0; + int count = 0; + for (Object v : values) { + if (count == 64) { + data.writeLong(bits); + count = 0; + bits = 0; + } + if (v != null) { + bits |= 1L << (count & 0x3f); + } + count++; + } + if (count > 0) { + data.writeLong(bits); + } + } + + @Override + public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + // write the ordinals as numerics + addNumericField(field, docToOrd, false); + + // write the values as FST + writeFST(field, values); + } + + // note: this might not be the most efficient... 
but its fairly simple + @Override + public void addSortedSetField(FieldInfo field, Iterable values, final Iterable docToOrdCount, final Iterable ords) throws IOException { + // write the ordinals as a binary field + addBinaryField(field, new Iterable() { + @Override + public Iterator iterator() { + return new SortedSetIterator(docToOrdCount.iterator(), ords.iterator()); + } + }); + + // write the values as FST + writeFST(field, values); + } + + // per-document vint-encoded byte[] + static class SortedSetIterator implements Iterator { + byte[] buffer = new byte[10]; + ByteArrayDataOutput out = new ByteArrayDataOutput(); + BytesRef ref = new BytesRef(); + + final Iterator counts; + final Iterator ords; + + SortedSetIterator(Iterator counts, Iterator ords) { + this.counts = counts; + this.ords = ords; + } + + @Override + public boolean hasNext() { + return counts.hasNext(); + } + + @Override + public BytesRef next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + + int count = counts.next().intValue(); + int maxSize = count*9; // worst case + if (maxSize > buffer.length) { + buffer = ArrayUtil.grow(buffer, maxSize); + } + + try { + encodeValues(count); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + + ref.bytes = buffer; + ref.offset = 0; + ref.length = out.getPosition(); + + return ref; + } + + // encodes count values to buffer + private void encodeValues(int count) throws IOException { + out.reset(buffer); + long lastOrd = 0; + for (int i = 0; i < count; i++) { + long ord = ords.next().longValue(); + out.writeVLong(ord - lastOrd); + lastOrd = ord; + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesFormat.java new file mode 100644 index 00000000000..2f6216db08b --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesFormat.java @@ -0,0 +1,72 @@ +package org.apache.lucene.codecs.memory; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.DocValuesConsumer; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.packed.PackedInts; + +/** In-memory docvalues format */ +public class MemoryDocValuesFormat extends DocValuesFormat { + + /** Maximum length for each binary doc values field. 
*/ + public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2; + + final float acceptableOverheadRatio; + + /** + * Calls {@link #MemoryDocValuesFormat(float) + * MemoryDocValuesFormat(PackedInts.DEFAULT)} + */ + public MemoryDocValuesFormat() { + this(PackedInts.DEFAULT); + } + + /** + * Creates a new MemoryDocValuesFormat with the specified + * acceptableOverheadRatio for NumericDocValues. + * @param acceptableOverheadRatio compression parameter for numerics. + * Currently this is only used when the number of unique values is small. + * + * @lucene.experimental + */ + public MemoryDocValuesFormat(float acceptableOverheadRatio) { + super("Memory"); + this.acceptableOverheadRatio = acceptableOverheadRatio; + } + + @Override + public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return new MemoryDocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION, acceptableOverheadRatio); + } + + @Override + public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { + return new MemoryDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION); + } + + static final String DATA_CODEC = "MemoryDocValuesData"; + static final String DATA_EXTENSION = "mdvd"; + static final String METADATA_CODEC = "MemoryDocValuesMetadata"; + static final String METADATA_EXTENSION = "mdvm"; +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java new file mode 100644 index 00000000000..46ed8b8e9b9 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryDocValuesProducer.java @@ -0,0 +1,633 @@ +package org.apache.lucene.codecs.memory; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesProducer; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.fst.BytesRefFSTEnum; +import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FST.Arc; +import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.packed.BlockPackedReader; +import org.apache.lucene.util.packed.MonotonicBlockPackedReader; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Reader for {@link MemoryDocValuesFormat} + */ +class MemoryDocValuesProducer extends DocValuesProducer { + // metadata maps (just file pointers and minimal stuff) + private final Map numerics; + private final Map binaries; + private final Map fsts; + private final IndexInput data; + + // ram instances we have already loaded + private final Map numericInstances = + new HashMap(); + private final Map binaryInstances = + new HashMap(); + private final Map> fstInstances = + new HashMap>(); + private final Map docsWithFieldInstances = new HashMap(); + + private final int maxDoc; + + + static final byte NUMBER = 0; + static final byte BYTES = 1; + static final byte FST = 2; + + static final int BLOCK_SIZE = 4096; + + static final byte DELTA_COMPRESSED = 0; + static final byte TABLE_COMPRESSED = 1; + static final byte UNCOMPRESSED = 2; + static final byte GCD_COMPRESSED = 3; + + static final int VERSION_START = 0; + static final int VERSION_GCD_COMPRESSION = 1; + static final int VERSION_CURRENT = VERSION_GCD_COMPRESSION; + + MemoryDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { + maxDoc = state.segmentInfo.getDocCount(); + String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); + // read in the entries from the metadata file. 
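+ // (entries are (field number, type byte, per-type metadata) tuples; the writer's
+ // close() appended a vInt -1 as the end-of-fields marker that readFields() loops until)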
+ IndexInput in = state.directory.openInput(metaName, state.context); + boolean success = false; + final int version; + try { + version = CodecUtil.checkHeader(in, metaCodec, + VERSION_START, + VERSION_CURRENT); + numerics = new HashMap(); + binaries = new HashMap(); + fsts = new HashMap(); + readFields(in, state.fieldInfos); + + success = true; + } finally { + if (success) { + IOUtils.close(in); + } else { + IOUtils.closeWhileHandlingException(in); + } + } + + success = false; + try { + String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); + data = state.directory.openInput(dataName, state.context); + final int version2 = CodecUtil.checkHeader(data, dataCodec, + VERSION_START, + VERSION_CURRENT); + if (version != version2) { + throw new CorruptIndexException("Format versions mismatch"); + } + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this.data); + } + } + } + + private void readFields(IndexInput meta, FieldInfos infos) throws IOException { + int fieldNumber = meta.readVInt(); + while (fieldNumber != -1) { + int fieldType = meta.readByte(); + if (fieldType == NUMBER) { + NumericEntry entry = new NumericEntry(); + entry.offset = meta.readLong(); + entry.missingOffset = meta.readLong(); + if (entry.missingOffset != -1) { + entry.missingBytes = meta.readLong(); + } else { + entry.missingBytes = 0; + } + entry.format = meta.readByte(); + switch(entry.format) { + case DELTA_COMPRESSED: + case TABLE_COMPRESSED: + case GCD_COMPRESSED: + case UNCOMPRESSED: + break; + default: + throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); + } + if (entry.format != UNCOMPRESSED) { + entry.packedIntsVersion = meta.readVInt(); + } + numerics.put(fieldNumber, entry); + } else if (fieldType == BYTES) { + BinaryEntry entry = new BinaryEntry(); + entry.offset = meta.readLong(); + entry.numBytes = meta.readLong(); + entry.missingOffset = meta.readLong(); + if (entry.missingOffset != -1) { + entry.missingBytes = meta.readLong(); + } else { + entry.missingBytes = 0; + } + entry.minLength = meta.readVInt(); + entry.maxLength = meta.readVInt(); + if (entry.minLength != entry.maxLength) { + entry.packedIntsVersion = meta.readVInt(); + entry.blockSize = meta.readVInt(); + } + binaries.put(fieldNumber, entry); + } else if (fieldType == FST) { + FSTEntry entry = new FSTEntry(); + entry.offset = meta.readLong(); + entry.numOrds = meta.readVLong(); + fsts.put(fieldNumber, entry); + } else { + throw new CorruptIndexException("invalid entry type: " + fieldType + ", input=" + meta); + } + fieldNumber = meta.readVInt(); + } + } + + @Override + public synchronized NumericDocValues getNumeric(FieldInfo field) throws IOException { + NumericDocValues instance = numericInstances.get(field.number); + if (instance == null) { + instance = loadNumeric(field); + numericInstances.put(field.number, instance); + } + return instance; + } + + private NumericDocValues loadNumeric(FieldInfo field) throws IOException { + NumericEntry entry = numerics.get(field.number); + data.seek(entry.offset + entry.missingBytes); + switch (entry.format) { + case TABLE_COMPRESSED: + int size = data.readVInt(); + if (size > 256) { + throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + data); + } + final long decode[] = new long[size]; + for (int i = 0; i < decode.length; i++) { + decode[i] = data.readLong(); + } + final int formatID = data.readVInt(); + final int bitsPerValue = 
data.readVInt(); + final PackedInts.Reader ordsReader = PackedInts.getReaderNoHeader(data, PackedInts.Format.byId(formatID), entry.packedIntsVersion, maxDoc, bitsPerValue); + return new NumericDocValues() { + @Override + public long get(int docID) { + return decode[(int)ordsReader.get(docID)]; + } + }; + case DELTA_COMPRESSED: + final int blockSize = data.readVInt(); + final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, blockSize, maxDoc, false); + return new NumericDocValues() { + @Override + public long get(int docID) { + return reader.get(docID); + } + }; + case UNCOMPRESSED: + final byte bytes[] = new byte[maxDoc]; + data.readBytes(bytes, 0, bytes.length); + return new NumericDocValues() { + @Override + public long get(int docID) { + return bytes[docID]; + } + }; + case GCD_COMPRESSED: + final long min = data.readLong(); + final long mult = data.readLong(); + final int quotientBlockSize = data.readVInt(); + final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, quotientBlockSize, maxDoc, false); + return new NumericDocValues() { + @Override + public long get(int docID) { + return min + mult * quotientReader.get(docID); + } + }; + default: + throw new AssertionError(); + } + } + + @Override + public synchronized BinaryDocValues getBinary(FieldInfo field) throws IOException { + BinaryDocValues instance = binaryInstances.get(field.number); + if (instance == null) { + instance = loadBinary(field); + binaryInstances.put(field.number, instance); + } + return instance; + } + + private BinaryDocValues loadBinary(FieldInfo field) throws IOException { + BinaryEntry entry = binaries.get(field.number); + data.seek(entry.offset); + PagedBytes bytes = new PagedBytes(16); + bytes.copy(data, entry.numBytes); + final PagedBytes.Reader bytesReader = bytes.freeze(true); + if (entry.minLength == entry.maxLength) { + final int fixedLength = entry.minLength; + return new BinaryDocValues() { + @Override + public void get(int docID, BytesRef result) { + bytesReader.fillSlice(result, fixedLength * (long)docID, fixedLength); + } + }; + } else { + data.seek(data.getFilePointer() + entry.missingBytes); + final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, maxDoc, false); + return new BinaryDocValues() { + @Override + public void get(int docID, BytesRef result) { + long startAddress = docID == 0 ? 
0 : addresses.get(docID-1); + long endAddress = addresses.get(docID); + bytesReader.fillSlice(result, startAddress, (int) (endAddress - startAddress)); + } + }; + } + } + + @Override + public SortedDocValues getSorted(FieldInfo field) throws IOException { + final FSTEntry entry = fsts.get(field.number); + if (entry.numOrds == 0) { + return SortedDocValues.EMPTY; + } + FST instance; + synchronized(this) { + instance = fstInstances.get(field.number); + if (instance == null) { + data.seek(entry.offset); + instance = new FST(data, PositiveIntOutputs.getSingleton()); + fstInstances.put(field.number, instance); + } + } + final NumericDocValues docToOrd = getNumeric(field); + final FST fst = instance; + + // per-thread resources + final BytesReader in = fst.getBytesReader(); + final Arc firstArc = new Arc(); + final Arc scratchArc = new Arc(); + final IntsRef scratchInts = new IntsRef(); + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); + + return new SortedDocValues() { + @Override + public int getOrd(int docID) { + return (int) docToOrd.get(docID); + } + + @Override + public void lookupOrd(int ord, BytesRef result) { + try { + in.setPosition(0); + fst.getFirstArc(firstArc); + IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts); + result.bytes = new byte[output.length]; + result.offset = 0; + result.length = 0; + Util.toBytesRef(output, result); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + @Override + public int lookupTerm(BytesRef key) { + try { + InputOutput o = fstEnum.seekCeil(key); + if (o == null) { + return -getValueCount()-1; + } else if (o.input.equals(key)) { + return o.output.intValue(); + } else { + return (int) -o.output-1; + } + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + @Override + public int getValueCount() { + return (int)entry.numOrds; + } + + @Override + public TermsEnum termsEnum() { + return new FSTTermsEnum(fst); + } + }; + } + + @Override + public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { + final FSTEntry entry = fsts.get(field.number); + if (entry.numOrds == 0) { + return SortedSetDocValues.EMPTY; // empty FST! 
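+ // (writeFST() only calls fst.save() when Builder.finish() returned a non-null FST,
+ // so a field with numOrds == 0 has no FST bytes on disk to load)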
+ } + FST instance; + synchronized(this) { + instance = fstInstances.get(field.number); + if (instance == null) { + data.seek(entry.offset); + instance = new FST(data, PositiveIntOutputs.getSingleton()); + fstInstances.put(field.number, instance); + } + } + final BinaryDocValues docToOrds = getBinary(field); + final FST fst = instance; + + // per-thread resources + final BytesReader in = fst.getBytesReader(); + final Arc firstArc = new Arc(); + final Arc scratchArc = new Arc(); + final IntsRef scratchInts = new IntsRef(); + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); + final BytesRef ref = new BytesRef(); + final ByteArrayDataInput input = new ByteArrayDataInput(); + return new SortedSetDocValues() { + long currentOrd; + + @Override + public long nextOrd() { + if (input.eof()) { + return NO_MORE_ORDS; + } else { + currentOrd += input.readVLong(); + return currentOrd; + } + } + + @Override + public void setDocument(int docID) { + docToOrds.get(docID, ref); + input.reset(ref.bytes, ref.offset, ref.length); + currentOrd = 0; + } + + @Override + public void lookupOrd(long ord, BytesRef result) { + try { + in.setPosition(0); + fst.getFirstArc(firstArc); + IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts); + result.bytes = new byte[output.length]; + result.offset = 0; + result.length = 0; + Util.toBytesRef(output, result); + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + @Override + public long lookupTerm(BytesRef key) { + try { + InputOutput o = fstEnum.seekCeil(key); + if (o == null) { + return -getValueCount()-1; + } else if (o.input.equals(key)) { + return o.output.intValue(); + } else { + return -o.output-1; + } + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + @Override + public long getValueCount() { + return entry.numOrds; + } + + @Override + public TermsEnum termsEnum() { + return new FSTTermsEnum(fst); + } + }; + } + + private Bits getMissingBits(int fieldNumber, final long offset, final long length) throws IOException { + if (offset == -1) { + return new Bits.MatchAllBits(maxDoc); + } else { + Bits instance; + synchronized(this) { + instance = docsWithFieldInstances.get(fieldNumber); + if (instance == null) { + IndexInput data = this.data.clone(); + data.seek(offset); + assert length % 8 == 0; + long bits[] = new long[(int) length >> 3]; + for (int i = 0; i < bits.length; i++) { + bits[i] = data.readLong(); + } + instance = new FixedBitSet(bits, maxDoc); + docsWithFieldInstances.put(fieldNumber, instance); + } + } + return instance; + } + } + + @Override + public Bits getDocsWithField(FieldInfo field) throws IOException { + switch(field.getDocValuesType()) { + case SORTED_SET: + return new SortedSetDocsWithField(getSortedSet(field), maxDoc); + case SORTED: + return new SortedDocsWithField(getSorted(field), maxDoc); + case BINARY: + BinaryEntry be = binaries.get(field.number); + return getMissingBits(field.number, be.missingOffset, be.missingBytes); + case NUMERIC: + NumericEntry ne = numerics.get(field.number); + return getMissingBits(field.number, ne.missingOffset, ne.missingBytes); + default: + throw new AssertionError(); + } + } + + @Override + public void close() throws IOException { + data.close(); + } + + static class NumericEntry { + long offset; + long missingOffset; + long missingBytes; + byte format; + int packedIntsVersion; + } + + static class BinaryEntry { + long offset; + long missingOffset; + long missingBytes; + long numBytes; + int minLength; + int maxLength; 
+ int packedIntsVersion; + int blockSize; + } + + static class FSTEntry { + long offset; + long numOrds; + } + + // exposes FSTEnum directly as a TermsEnum: avoids binary-search next() + static class FSTTermsEnum extends TermsEnum { + final BytesRefFSTEnum<Long> in; + + // this is all for the complicated seek(ord)... + // maybe we should add a FSTEnum that supports this operation? + final FST<Long> fst; + final FST.BytesReader bytesReader; + final Arc<Long> firstArc = new Arc<Long>(); + final Arc<Long> scratchArc = new Arc<Long>(); + final IntsRef scratchInts = new IntsRef(); + final BytesRef scratchBytes = new BytesRef(); + + FSTTermsEnum(FST<Long> fst) { + this.fst = fst; + in = new BytesRefFSTEnum<Long>(fst); + bytesReader = fst.getBytesReader(); + } + + @Override + public BytesRef next() throws IOException { + InputOutput<Long> io = in.next(); + if (io == null) { + return null; + } else { + return io.input; + } + } + + @Override + public Comparator<BytesRef> getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + if (in.seekCeil(text) == null) { + return SeekStatus.END; + } else if (term().equals(text)) { + // TODO: add SeekStatus to FSTEnum like in https://issues.apache.org/jira/browse/LUCENE-3729 + // to remove this comparison? + return SeekStatus.FOUND; + } else { + return SeekStatus.NOT_FOUND; + } + } + + @Override + public boolean seekExact(BytesRef text) throws IOException { + if (in.seekExact(text) == null) { + return false; + } else { + return true; + } + } + + @Override + public void seekExact(long ord) throws IOException { + // TODO: would be better to make this simpler and faster. + // but we don't want to introduce a bug that corrupts our enum state! + bytesReader.setPosition(0); + fst.getFirstArc(firstArc); + IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts); + scratchBytes.bytes = new byte[output.length]; + scratchBytes.offset = 0; + scratchBytes.length = 0; + Util.toBytesRef(output, scratchBytes); + // TODO: we could do this lazily, better to try to push into FSTEnum though? + in.seekExact(scratchBytes); + } + + @Override + public BytesRef term() throws IOException { + return in.current().input; + } + + @Override + public long ord() throws IOException { + return in.current().output; + } + + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/package.html b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/package.html index 340e8316908..468cc07257e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/package.html +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/package.html @@ -20,6 +20,6 @@ -Postings format that is read entirely into memory. +Postings and DocValues formats that are read entirely into memory.
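The missing-docs bitset is the one genuinely new on-disk structure in this series: writeMissingBitset() in the consumer packs one bit per document into longs (bit set = document has a value), and getMissingBits() in the producer reads the same longs back and wraps them in a FixedBitSet. A self-contained round trip of that layout, with plain arrays standing in for the IndexOutput/IndexInput used above:

    // encode: bit (doc % 64) of word (doc / 64) is set iff the document has a value,
    // the same layout writeMissingBitset() streams out, one long per 64 documents
    static long[] encodeDocsWithField(boolean[] hasValue) {
      long[] words = new long[(hasValue.length + 63) / 64];
      for (int doc = 0; doc < hasValue.length; doc++) {
        if (hasValue[doc]) {
          words[doc >> 6] |= 1L << (doc & 0x3f);
        }
      }
      return words;
    }

    // decode: getMissingBits() builds new FixedBitSet(words, maxDoc);
    // this is the bit test FixedBitSet.get() performs
    static boolean docHasValue(long[] words, int doc) {
      return (words[doc >> 6] & (1L << (doc & 0x3f))) != 0;
    }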
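addNumericField() above chooses among four layouts: UNCOMPRESSED single bytes, TABLE_COMPRESSED ords into a table of at most 256 distinct values, GCD_COMPRESSED, and DELTA_COMPRESSED as the fallback. The GCD case is the least obvious: only quotients go to disk, and loadNumeric()'s GCD_COMPRESSED branch reconstructs each value as min + gcd * quotient. The arithmetic in isolation:

    // encode side: store q = (v - min) / gcd; missing docs were already mapped to v = 0
    static long[] gcdEncode(long[] values, long min, long gcd) {
      long[] quotients = new long[values.length];
      for (int i = 0; i < values.length; i++) {
        quotients[i] = (values[i] - min) / gcd;
      }
      return quotients;
    }

    // decode side: matches "return min + mult * quotientReader.get(docID)" above
    static long gcdDecode(long min, long gcd, long quotient) {
      return min + gcd * quotient;
    }

    // e.g. timestamps rounded to whole seconds: values of the form min + n * 1000
    // store only the small quotients n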
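Sorted-set support piggybacks on the binary machinery: SortedSetIterator above turns each document's ascending ords into a delta-vLong byte[], and the producer's setDocument()/nextOrd() pair replays it. A sketch of the encoding in isolation, using the same ByteArrayDataOutput/ByteArrayDataInput classes the codec itself uses (the ord count is carried separately here, as the docToOrdCount iterable is in the consumer):

    import org.apache.lucene.store.ByteArrayDataInput;
    import org.apache.lucene.store.ByteArrayDataOutput;

    // encode one document's ascending ords as vLong deltas (cf. SortedSetIterator.encodeValues)
    static int encodeOrds(long[] ords, byte[] buffer) throws IOException {
      ByteArrayDataOutput out = new ByteArrayDataOutput();
      out.reset(buffer);
      long lastOrd = 0;
      for (long ord : ords) {
        out.writeVLong(ord - lastOrd);   // deltas keep each vLong to a byte or two
        lastOrd = ord;
      }
      return out.getPosition();          // becomes the per-document BytesRef length
    }

    // decode: mirrors setDocument() resetting the input and nextOrd() accumulating deltas
    static long[] decodeOrds(byte[] buffer, int length, int count) {
      ByteArrayDataInput in = new ByteArrayDataInput(buffer, 0, length);
      long[] ords = new long[count];
      long currentOrd = 0;
      for (int i = 0; i < count; i++) {
        currentOrd += in.readVLong();
        ords[i] = currentOrd;
      }
      return ords;
    }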
\ No newline at end of file diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat index e289c4d0966..5103c529355 100644 --- a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat @@ -14,4 +14,5 @@ # limitations under the License. org.apache.lucene.codecs.diskdv.DiskDocValuesFormat +org.apache.lucene.codecs.memory.MemoryDocValuesFormat org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestMemoryDocValuesFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestMemoryDocValuesFormat.java new file mode 100644 index 00000000000..77c6ea582a8 --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestMemoryDocValuesFormat.java @@ -0,0 +1,39 @@ +package org.apache.lucene.codecs.memory; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase; +import org.apache.lucene.util._TestUtil; + +/** + * Tests MemoryDocValuesFormat + */ +public class TestMemoryDocValuesFormat extends BaseCompressingDocValuesFormatTestCase { + private final Codec codec = _TestUtil.alwaysDocValuesFormat(new MemoryDocValuesFormat()); + + @Override + protected Codec getCodec() { + return codec; + } + + @Override + protected boolean codecAcceptsHugeBinaryValues(String field) { + return false; + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index 239be0f34c7..3530a31ce21 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -40,6 +40,7 @@ import org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat; import org.apache.lucene.codecs.bloom.TestBloomFilteredLucene41Postings; import org.apache.lucene.codecs.diskdv.DiskDocValuesFormat; import org.apache.lucene.codecs.memory.DirectPostingsFormat; +import org.apache.lucene.codecs.memory.MemoryDocValuesFormat; import org.apache.lucene.codecs.memory.MemoryPostingsFormat; import org.apache.lucene.codecs.mockintblock.MockFixedIntBlockPostingsFormat; import org.apache.lucene.codecs.mockintblock.MockVariableIntBlockPostingsFormat; @@ -149,6 +150,7 @@ public class RandomCodec extends Lucene45Codec { addDocValues(avoidCodecs, new Lucene45DocValuesFormat(), new DiskDocValuesFormat(), + new MemoryDocValuesFormat(), new SimpleTextDocValuesFormat(), new AssertingDocValuesFormat()); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java index b1a43a0fa59..19c16bcd9bc 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java @@ -760,7 +760,7 @@ public class _TestUtil { // TODO: remove this, push this test to Lucene40/Lucene42 codec tests public static boolean fieldSupportsHugeBinaryDocValues(String field) { String dvFormat = getDocValuesFormat(field); - if (dvFormat.equals("Lucene40") || dvFormat.equals("Lucene42")) { + if (dvFormat.equals("Lucene40") || dvFormat.equals("Lucene42") || dvFormat.equals("Memory")) { return false; } return true; From 863c2191226ce9a791ee693795e0d211f9d7419d Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 19 Aug 2013 21:08:42 +0000 Subject: [PATCH 11/16] add checks git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515605 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/index/AssertingAtomicReader.java | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java index eb6b20df5eb..1bdb14cf574 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java @@ -608,6 +608,32 @@ public class AssertingAtomicReader extends FilterAtomicReader { } } + @Override + public Bits getLiveDocs() { + Bits liveDocs = super.getLiveDocs(); + if (liveDocs != null) { + assert maxDoc() == liveDocs.length(); + } else { + assert maxDoc() == numDocs(); + assert 
!hasDeletions(); + } + return liveDocs; + } + + @Override + public Bits getDocsWithField(String field) throws IOException { + Bits docsWithField = super.getDocsWithField(field); + FieldInfo fi = getFieldInfos().fieldInfo(field); + if (docsWithField != null) { + assert fi != null; + assert fi.hasDocValues(); + assert maxDoc() == docsWithField.length(); + } else { + assert fi == null || fi.hasDocValues() == false; + } + return docsWithField; + } + // this is the same hack as FCInvisible @Override public Object getCoreCacheKey() { From bd5ca555fea6af0ab4a59ec4587137265bccc1b1 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 19 Aug 2013 21:27:45 +0000 Subject: [PATCH 12/16] improve tests git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515613 13f79535-47bb-0310-9956-ffa450edef68 --- .../index/BaseDocValuesFormatTestCase.java | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index c1902cd93eb..8dfaf0cec3f 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -1232,6 +1232,73 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { dir.close(); } + private void doTestMissingVsFieldCache(final long minValue, final long maxValue) throws Exception { + doTestMissingVsFieldCache(new LongProducer() { + @Override + long next() { + return _TestUtil.nextLong(random(), minValue, maxValue); + } + }); + } + + private void doTestMissingVsFieldCache(LongProducer longs) throws Exception { + assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv")); + Directory dir = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); + Field idField = new StringField("id", "", Field.Store.NO); + Field indexedField = newStringField("indexed", "", Field.Store.NO); + Field dvField = new NumericDocValuesField("dv", 0); + + + // index some docs + int numDocs = atLeast(300); + // numDocs should be always > 256 so that in case of a codec that optimizes + // for numbers of values <= 256, all storage layouts are tested + assert numDocs > 256; + for (int i = 0; i < numDocs; i++) { + idField.setStringValue(Integer.toString(i)); + long value = longs.next(); + indexedField.setStringValue(Long.toString(value)); + dvField.setLongValue(value); + Document doc = new Document(); + doc.add(idField); + // 1/4 of the time we neglect to add the fields + if (random().nextInt(4) > 0) { + doc.add(indexedField); + doc.add(dvField); + } + writer.addDocument(doc); + if (random().nextInt(31) == 0) { + writer.commit(); + } + } + + // delete some docs + int numDeletions = random().nextInt(numDocs/10); + for (int i = 0; i < numDeletions; i++) { + int id = random().nextInt(numDocs); + writer.deleteDocuments(new Term("id", Integer.toString(id))); + } + + // merge some segments and ensure that at least one of them has more than + // 256 values + writer.forceMerge(numDocs / 256); + + writer.close(); + + // compare + DirectoryReader ir = DirectoryReader.open(dir); + for (AtomicReaderContext context : ir.leaves()) { + AtomicReader r = context.reader(); + Bits expected = FieldCache.DEFAULT.getDocsWithField(r, 
"indexed"); + Bits actual = FieldCache.DEFAULT.getDocsWithField(r, "dv"); + assertEquals(expected, actual); + } + ir.close(); + dir.close(); + } + public void testBooleanNumericsVsStoredFields() throws Exception { int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { @@ -1246,6 +1313,13 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } } + public void testByteMissingVsFieldCache() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestMissingVsFieldCache(Byte.MIN_VALUE, Byte.MAX_VALUE); + } + } + public void testShortNumericsVsStoredFields() throws Exception { int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { @@ -1253,6 +1327,13 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } } + public void testShortMissingVsFieldCache() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestMissingVsFieldCache(Short.MIN_VALUE, Short.MAX_VALUE); + } + } + public void testIntNumericsVsStoredFields() throws Exception { int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { @@ -1260,6 +1341,13 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } } + public void testIntMissingVsFieldCache() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestMissingVsFieldCache(Integer.MIN_VALUE, Integer.MAX_VALUE); + } + } + public void testLongNumericsVsStoredFields() throws Exception { int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { @@ -1267,6 +1355,13 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } } + public void testLongMissingVsFieldCache() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestMissingVsFieldCache(Long.MIN_VALUE, Long.MAX_VALUE); + } + } + private void doTestBinaryVsStoredFields(int minLength, int maxLength) throws Exception { Directory dir = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); @@ -2014,6 +2109,13 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { doTestSortedSetVsStoredFields(1, 10); } } + + private void assertEquals(Bits expected, Bits actual) throws Exception { + assertEquals(expected.length(), actual.length()); + for (int i = 0; i < expected.length(); i++) { + assertEquals(expected.get(i), actual.get(i)); + } + } private void assertEquals(int maxDoc, SortedDocValues expected, SortedDocValues actual) throws Exception { assertEquals(maxDoc, new SingletonSortedSetDocValues(expected), new SingletonSortedSetDocValues(actual)); From cf0078a6861051d9cf46d1981b360ac7520e28b1 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 20 Aug 2013 02:05:12 +0000 Subject: [PATCH 13/16] beef up tests git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515686 13f79535-47bb-0310-9956-ffa450edef68 --- .../simpletext/SimpleTextDocValuesReader.java | 22 +- .../asserting/AssertingDocValuesFormat.java | 88 ++++-- .../asserting/AssertingNormsFormat.java | 4 +- .../lucene/index/AssertingAtomicReader.java | 22 ++ .../index/BaseDocValuesFormatTestCase.java | 258 +++++++++++++++++- .../apache/lucene/util/LuceneTestCase.java | 8 +- 6 files changed, 352 insertions(+), 50 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java 
b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java index 9ead984bb84..3753a62193c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java @@ -28,7 +28,6 @@ import java.util.Locale; import java.util.Map; import org.apache.lucene.codecs.DocValuesProducer; -import org.apache.lucene.codecs.DocValuesProducer.SortedSetDocsWithField; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; @@ -431,16 +430,17 @@ class SimpleTextDocValuesReader extends DocValuesProducer { @Override public Bits getDocsWithField(FieldInfo field) throws IOException { - if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) { - return new SortedSetDocsWithField(getSortedSet(field), maxDoc); - } else if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED) { - return new SortedDocsWithField(getSorted(field), maxDoc); - } else if (field.getDocValuesType() == FieldInfo.DocValuesType.BINARY) { - return getBinaryDocsWithField(field); - } else if (field.getDocValuesType() == FieldInfo.DocValuesType.NUMERIC) { - return getNumericDocsWithField(field); - } else { - return new Bits.MatchAllBits(maxDoc); + switch (field.getDocValuesType()) { + case SORTED_SET: + return new SortedSetDocsWithField(getSortedSet(field), maxDoc); + case SORTED: + return new SortedDocsWithField(getSorted(field), maxDoc); + case BINARY: + return getBinaryDocsWithField(field); + case NUMERIC: + return getNumericDocsWithField(field); + default: + throw new AssertionError(); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java index 298d7aaf011..f3525a43189 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingDocValuesFormat.java @@ -175,28 +175,6 @@ public class AssertingDocValuesFormat extends DocValuesFormat { checkIterator(ords.iterator(), ordCount, false); in.addSortedSetField(field, values, docToOrdCount, ords); } - - private void checkIterator(Iterator iterator, long expectedSize, boolean allowNull) { - for (long i = 0; i < expectedSize; i++) { - boolean hasNext = iterator.hasNext(); - assert hasNext; - T v = iterator.next(); - assert allowNull || v != null; - try { - iterator.remove(); - throw new AssertionError("broken iterator (supports remove): " + iterator); - } catch (UnsupportedOperationException expected) { - // ok - } - } - assert !iterator.hasNext(); - try { - iterator.next(); - throw new AssertionError("broken iterator (allows next() when hasNext==false) " + iterator); - } catch (NoSuchElementException expected) { - // ok - } - } @Override public void close() throws IOException { @@ -204,6 +182,70 @@ public class AssertingDocValuesFormat extends DocValuesFormat { } } + static class AssertingNormsConsumer extends DocValuesConsumer { + private final DocValuesConsumer in; + private final int maxDoc; + + AssertingNormsConsumer(DocValuesConsumer in, int maxDoc) { + this.in = in; + this.maxDoc = maxDoc; + } + + @Override + public void addNumericField(FieldInfo field, Iterable values) throws IOException { + int count = 0; + for (Number v : values) { + assert v != null; + 
count++; + } + assert count == maxDoc; + checkIterator(values.iterator(), maxDoc, false); + in.addNumericField(field, values); + } + + @Override + public void close() throws IOException { + in.close(); + } + + @Override + public void addBinaryField(FieldInfo field, Iterable values) throws IOException { + throw new IllegalStateException(); + } + + @Override + public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException { + throw new IllegalStateException(); + } + + @Override + public void addSortedSetField(FieldInfo field, Iterable values, Iterable docToOrdCount, Iterable ords) throws IOException { + throw new IllegalStateException(); + } + } + + private static void checkIterator(Iterator iterator, long expectedSize, boolean allowNull) { + for (long i = 0; i < expectedSize; i++) { + boolean hasNext = iterator.hasNext(); + assert hasNext; + T v = iterator.next(); + assert allowNull || v != null; + try { + iterator.remove(); + throw new AssertionError("broken iterator (supports remove): " + iterator); + } catch (UnsupportedOperationException expected) { + // ok + } + } + assert !iterator.hasNext(); + try { + iterator.next(); + throw new AssertionError("broken iterator (allows next() when hasNext==false) " + iterator); + } catch (NoSuchElementException expected) { + // ok + } + } + static class AssertingDocValuesProducer extends DocValuesProducer { private final DocValuesProducer in; private final int maxDoc; @@ -252,7 +294,7 @@ public class AssertingDocValuesFormat extends DocValuesFormat { Bits bits = in.getDocsWithField(field); assert bits != null; assert bits.length() == maxDoc; - return bits; // TODO: add AssertingBits w/ bounds check + return new AssertingAtomicReader.AssertingBits(bits); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java index 8b64401b452..b7662674133 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingNormsFormat.java @@ -22,7 +22,7 @@ import java.io.IOException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.NormsFormat; -import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat.AssertingDocValuesConsumer; +import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat.AssertingNormsConsumer; import org.apache.lucene.codecs.asserting.AssertingDocValuesFormat.AssertingDocValuesProducer; import org.apache.lucene.codecs.lucene42.Lucene42NormsFormat; import org.apache.lucene.index.SegmentReadState; @@ -38,7 +38,7 @@ public class AssertingNormsFormat extends NormsFormat { public DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException { DocValuesConsumer consumer = in.normsConsumer(state); assert consumer != null; - return new AssertingDocValuesConsumer(consumer, state.segmentInfo.getDocCount()); + return new AssertingNormsConsumer(consumer, state.segmentInfo.getDocCount()); } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java index 1bdb14cf574..086cb21fb89 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java +++ 
b/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java @@ -607,12 +607,33 @@ public class AssertingAtomicReader extends FilterAtomicReader { return null; } } + + /** Wraps a Bits but with additional asserts */ + public static class AssertingBits implements Bits { + final Bits in; + + public AssertingBits(Bits in) { + this.in = in; + } + + @Override + public boolean get(int index) { + assert index >= 0 && index < length(); + return in.get(index); + } + + @Override + public int length() { + return in.length(); + } + } @Override public Bits getLiveDocs() { Bits liveDocs = super.getLiveDocs(); if (liveDocs != null) { assert maxDoc() == liveDocs.length(); + liveDocs = new AssertingBits(liveDocs); } else { assert maxDoc() == numDocs(); assert !hasDeletions(); @@ -628,6 +649,7 @@ public class AssertingAtomicReader extends FilterAtomicReader { assert fi != null; assert fi.hasDocValues(); assert maxDoc() == docsWithField.length(); + docsWithField = new AssertingBits(docsWithField); } else { assert fi == null || fi.hasDocValues() == false; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index 8dfaf0cec3f..9c94bb61567 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -26,6 +26,7 @@ import java.util.Map.Entry; import java.util.Map; import java.util.Set; import java.util.TreeSet; +import java.util.concurrent.CountDownLatch; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; @@ -673,7 +674,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { iwriter.close(); SortedDocValues dv = getOnlySegmentReader(ireader).getSortedDocValues("field"); - if (codecSupportsDocsWithField("field")) { + if (defaultCodecSupportsDocsWithField()) { assertEquals(-1, dv.getOrd(0)); assertEquals(0, dv.getValueCount()); } else { @@ -733,7 +734,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { BytesRef scratch = new BytesRef(); dv.lookupOrd(dv.getOrd(0), scratch); assertEquals(new BytesRef("hello world 2"), scratch); - if (codecSupportsDocsWithField("dv")) { + if (defaultCodecSupportsDocsWithField()) { assertEquals(-1, dv.getOrd(1)); } dv.get(1, scratch); @@ -1115,7 +1116,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { doc.add(newTextField("id", "noValue", Field.Store.YES)); w.addDocument(doc); } - if (!codecSupportsDocsWithField("field")) { + if (!defaultCodecSupportsDocsWithField()) { BytesRef bytesRef = new BytesRef(); hash.add(bytesRef); // add empty value for the gaps } @@ -1242,7 +1243,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } private void doTestMissingVsFieldCache(LongProducer longs) throws Exception { - assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv")); + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); Directory dir = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); @@ -2345,7 +2346,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } public void testTwoNumbersOneMissing() throws IOException { - 
assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); Directory directory = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); conf.setMergePolicy(newLogMergePolicy()); @@ -2374,7 +2375,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } public void testTwoNumbersOneMissingWithMerging() throws IOException { - assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); Directory directory = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); conf.setMergePolicy(newLogMergePolicy()); @@ -2404,7 +2405,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } public void testThreeNumbersOneMissingWithMerging() throws IOException { - assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); Directory directory = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); conf.setMergePolicy(newLogMergePolicy()); @@ -2440,7 +2441,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } public void testTwoBytesOneMissing() throws IOException { - assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); Directory directory = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); conf.setMergePolicy(newLogMergePolicy()); @@ -2472,7 +2473,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } public void testTwoBytesOneMissingWithMerging() throws IOException { - assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); Directory directory = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); conf.setMergePolicy(newLogMergePolicy()); @@ -2505,7 +2506,7 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { } public void testThreeBytesOneMissingWithMerging() throws IOException { - assumeTrue("Codec does not support getDocsWithField", codecSupportsDocsWithField("dv1")); + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); Directory directory = newDirectory(); IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null); conf.setMergePolicy(newLogMergePolicy()); @@ -2709,6 +2710,243 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { ar.close(); d.close(); } + + /** Tests dv against stored fields with threads (binary/numeric/sorted, no missing) */ + public void testThreads() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); + Document doc = new Document(); + Field idField = new StringField("id", "", Field.Store.NO); + Field storedBinField = new StoredField("storedBin", new byte[0]); + Field dvBinField = new 
BinaryDocValuesField("dvBin", new BytesRef()); + Field dvSortedField = new SortedDocValuesField("dvSorted", new BytesRef()); + Field storedNumericField = new StoredField("storedNum", ""); + Field dvNumericField = new NumericDocValuesField("dvNum", 0); + doc.add(idField); + doc.add(storedBinField); + doc.add(dvBinField); + doc.add(dvSortedField); + doc.add(storedNumericField); + doc.add(dvNumericField); + + // index some docs + int numDocs = atLeast(300); + for (int i = 0; i < numDocs; i++) { + idField.setStringValue(Integer.toString(i)); + int length = _TestUtil.nextInt(random(), 0, 8); + byte buffer[] = new byte[length]; + random().nextBytes(buffer); + storedBinField.setBytesValue(buffer); + dvBinField.setBytesValue(buffer); + dvSortedField.setBytesValue(buffer); + long numericValue = random().nextLong(); + storedNumericField.setStringValue(Long.toString(numericValue)); + dvNumericField.setLongValue(numericValue); + writer.addDocument(doc); + if (random().nextInt(31) == 0) { + writer.commit(); + } + } + + // delete some docs + int numDeletions = random().nextInt(numDocs/10); + for (int i = 0; i < numDeletions; i++) { + int id = random().nextInt(numDocs); + writer.deleteDocuments(new Term("id", Integer.toString(id))); + } + writer.close(); + + // compare + final DirectoryReader ir = DirectoryReader.open(dir); + int numThreads = _TestUtil.nextInt(random(), 2, 7); + Thread threads[] = new Thread[numThreads]; + final CountDownLatch startingGun = new CountDownLatch(1); + + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread() { + @Override + public void run() { + try { + startingGun.await(); + for (AtomicReaderContext context : ir.leaves()) { + AtomicReader r = context.reader(); + BinaryDocValues binaries = r.getBinaryDocValues("dvBin"); + SortedDocValues sorted = r.getSortedDocValues("dvSorted"); + NumericDocValues numerics = r.getNumericDocValues("dvNum"); + for (int j = 0; j < r.maxDoc(); j++) { + BytesRef binaryValue = r.document(j).getBinaryValue("storedBin"); + BytesRef scratch = new BytesRef(); + binaries.get(j, scratch); + assertEquals(binaryValue, scratch); + sorted.get(j, scratch); + assertEquals(binaryValue, scratch); + String expected = r.document(j).get("storedNum"); + assertEquals(Long.parseLong(expected), numerics.get(j)); + } + } + _TestUtil.checkReader(ir); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + }; + threads[i].start(); + } + startingGun.countDown(); + for (Thread t : threads) { + t.join(); + } + ir.close(); + dir.close(); + } + + /** Tests dv against stored fields with threads (all types + missing) */ + public void testThreads2() throws Exception { + assumeTrue("Codec does not support getDocsWithField", defaultCodecSupportsDocsWithField()); + assumeTrue("Codec does not support SORTED_SET", defaultCodecSupportsSortedSet()); + Directory dir = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); + Field idField = new StringField("id", "", Field.Store.NO); + Field storedBinField = new StoredField("storedBin", new byte[0]); + Field dvBinField = new BinaryDocValuesField("dvBin", new BytesRef()); + Field dvSortedField = new SortedDocValuesField("dvSorted", new BytesRef()); + Field storedNumericField = new StoredField("storedNum", ""); + Field dvNumericField = new NumericDocValuesField("dvNum", 0); + + // index some docs + int numDocs = atLeast(300); + for (int i = 0; i < numDocs; i++) { + 
idField.setStringValue(Integer.toString(i)); + int length = _TestUtil.nextInt(random(), 0, 8); + byte buffer[] = new byte[length]; + random().nextBytes(buffer); + storedBinField.setBytesValue(buffer); + dvBinField.setBytesValue(buffer); + dvSortedField.setBytesValue(buffer); + long numericValue = random().nextLong(); + storedNumericField.setStringValue(Long.toString(numericValue)); + dvNumericField.setLongValue(numericValue); + Document doc = new Document(); + doc.add(idField); + if (random().nextInt(4) > 0) { + doc.add(storedBinField); + doc.add(dvBinField); + doc.add(dvSortedField); + } + if (random().nextInt(4) > 0) { + doc.add(storedNumericField); + doc.add(dvNumericField); + } + int numSortedSetFields = random().nextInt(3); + Set values = new TreeSet(); + for (int j = 0; j < numSortedSetFields; j++) { + values.add(_TestUtil.randomSimpleString(random())); + } + for (String v : values) { + doc.add(new SortedSetDocValuesField("dvSortedSet", new BytesRef(v))); + doc.add(new StoredField("storedSortedSet", v)); + } + writer.addDocument(doc); + if (random().nextInt(31) == 0) { + writer.commit(); + } + } + + // delete some docs + int numDeletions = random().nextInt(numDocs/10); + for (int i = 0; i < numDeletions; i++) { + int id = random().nextInt(numDocs); + writer.deleteDocuments(new Term("id", Integer.toString(id))); + } + writer.close(); + + // compare + final DirectoryReader ir = DirectoryReader.open(dir); + int numThreads = _TestUtil.nextInt(random(), 2, 7); + Thread threads[] = new Thread[numThreads]; + final CountDownLatch startingGun = new CountDownLatch(1); + + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread() { + @Override + public void run() { + try { + startingGun.await(); + for (AtomicReaderContext context : ir.leaves()) { + AtomicReader r = context.reader(); + BinaryDocValues binaries = r.getBinaryDocValues("dvBin"); + Bits binaryBits = r.getDocsWithField("dvBin"); + SortedDocValues sorted = r.getSortedDocValues("dvSorted"); + Bits sortedBits = r.getDocsWithField("dvSorted"); + NumericDocValues numerics = r.getNumericDocValues("dvNum"); + Bits numericBits = r.getDocsWithField("dvNum"); + SortedSetDocValues sortedSet = r.getSortedSetDocValues("dvSortedSet"); + Bits sortedSetBits = r.getDocsWithField("dvSortedSet"); + for (int j = 0; j < r.maxDoc(); j++) { + BytesRef binaryValue = r.document(j).getBinaryValue("storedBin"); + if (binaryValue != null) { + if (binaries != null) { + BytesRef scratch = new BytesRef(); + binaries.get(j, scratch); + assertEquals(binaryValue, scratch); + sorted.get(j, scratch); + assertEquals(binaryValue, scratch); + assertTrue(binaryBits.get(j)); + assertTrue(sortedBits.get(j)); + } + } else if (binaries != null) { + assertFalse(binaryBits.get(j)); + assertFalse(sortedBits.get(j)); + assertEquals(-1, sorted.getOrd(j)); + } + + String number = r.document(j).get("storedNum"); + if (number != null) { + if (numerics != null) { + assertEquals(Long.parseLong(number), numerics.get(j)); + } + } else if (numerics != null) { + assertFalse(numericBits.get(j)); + assertEquals(0, numerics.get(j)); + } + + String values[] = r.document(j).getValues("storedSortedSet"); + if (values.length > 0) { + assertNotNull(sortedSet); + sortedSet.setDocument(j); + for (int i = 0; i < values.length; i++) { + long ord = sortedSet.nextOrd(); + assertTrue(ord != SortedSetDocValues.NO_MORE_ORDS); + BytesRef value = new BytesRef(); + sortedSet.lookupOrd(ord, value); + assertEquals(values[i], value.utf8ToString()); + } + assertEquals(SortedSetDocValues.NO_MORE_ORDS, 
sortedSet.nextOrd()); + assertTrue(sortedSetBits.get(j)); + } else if (sortedSet != null) { + sortedSet.setDocument(j); + assertEquals(SortedSetDocValues.NO_MORE_ORDS, sortedSet.nextOrd()); + assertFalse(sortedSetBits.get(j)); + } + } + } + _TestUtil.checkReader(ir); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + }; + threads[i].start(); + } + startingGun.countDown(); + for (Thread t : threads) { + t.join(); + } + ir.close(); + dir.close(); + } protected boolean codecAcceptsHugeBinaryValues(String field) { return true; diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java index 27f736b8706..6f9f2b3e749 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java @@ -1369,11 +1369,11 @@ public abstract class LuceneTestCase extends Assert { return true; } - /** Returns true if the codec for the field "supports" docsWithField + /** Returns true if the codec "supports" docsWithField * (other codecs return MatchAllBits, because you couldnt write missing values before) */ - public static boolean codecSupportsDocsWithField(String field) { - String name = _TestUtil.getDocValuesFormat(Codec.getDefault(), field); - if (name.equals("Lucene40") || name.equals("Lucene42")) { + public static boolean defaultCodecSupportsDocsWithField() { + String name = Codec.getDefault().getName(); + if (name.equals("Lucene40") || name.equals("Lucene41") || name.equals("Lucene42")) { return false; } return true; From 424369f49140e3a69e9784013f23e52b57c0be23 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 20 Aug 2013 02:46:13 +0000 Subject: [PATCH 14/16] add test git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515689 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/index/TestMultiDocValues.java | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java b/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java index 78c8974c555..8944dd66b67 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestMultiDocValues.java @@ -26,6 +26,7 @@ import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -321,4 +322,52 @@ public class TestMultiDocValues extends LuceneTestCase { ir2.close(); dir.close(); } + + public void testDocsWithField() throws Exception { + assumeTrue("codec does not support docsWithField", defaultCodecSupportsDocsWithField()); + Directory dir = newDirectory(); + + IndexWriterConfig iwc = newIndexWriterConfig(random(), TEST_VERSION_CURRENT, null); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + int numDocs = atLeast(500); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + if (random().nextInt(4) >= 0) { + doc.add(new NumericDocValuesField("numbers", random().nextLong())); + } + doc.add(new NumericDocValuesField("numbersAlways", random().nextLong())); + 
iw.addDocument(doc); + if (random().nextInt(17) == 0) { + iw.commit(); + } + } + DirectoryReader ir = iw.getReader(); + iw.forceMerge(1); + DirectoryReader ir2 = iw.getReader(); + AtomicReader merged = getOnlySegmentReader(ir2); + iw.close(); + + Bits multi = MultiDocValues.getDocsWithField(ir, "numbers"); + Bits single = merged.getDocsWithField("numbers"); + if (multi == null) { + assertNull(single); + } else { + assertEquals(single.length(), multi.length()); + for (int i = 0; i < numDocs; i++) { + assertEquals(single.get(i), multi.get(i)); + } + } + + multi = MultiDocValues.getDocsWithField(ir, "numbersAlways"); + single = merged.getDocsWithField("numbersAlways"); + assertEquals(single.length(), multi.length()); + for (int i = 0; i < numDocs; i++) { + assertEquals(single.get(i), multi.get(i)); + } + ir.close(); + ir2.close(); + dir.close(); + } } From d15e95bd050c26b7c15cc57303d4a0cd52089e68 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 20 Aug 2013 16:50:56 +0000 Subject: [PATCH 15/16] beef up TestDuelingCodecs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515870 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/index/TestDuelingCodecs.java | 6 ++++++ .../org/apache/lucene/util/LuceneTestCase.java | 14 ++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java index e222258bf0a..25fef8ca367 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDuelingCodecs.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.Document; +import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; @@ -134,6 +135,11 @@ public class TestDuelingCodecs extends LuceneTestCase { for (String trash : split) { document.add(new SortedSetDocValuesField("sortedset", new BytesRef(trash))); } + // add a numeric dv field sometimes + document.removeFields("sparsenumeric"); + if (random.nextInt(4) == 2) { + document.add(new NumericDocValuesField("sparsenumeric", random.nextInt())); + } writer.addDocument(document); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java index 6f9f2b3e749..1ca13974195 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java @@ -1972,6 +1972,20 @@ public abstract class LuceneTestCase extends Assert { assertNull(info, rightValues); } } + + { + Bits leftBits = MultiDocValues.getDocsWithField(leftReader, field); + Bits rightBits = MultiDocValues.getDocsWithField(rightReader, field); + if (leftBits != null && rightBits != null) { + assertEquals(info, leftBits.length(), rightBits.length()); + for (int i = 0; i < leftBits.length(); i++) { + assertEquals(info, leftBits.get(i), rightBits.get(i)); + } + } else { + assertNull(info, leftBits); + assertNull(info, rightBits); + } + } } } From 2bbc869516c222ad0ddc56838a3c32010d0d46cc Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 20 Aug 2013 20:37:06 +0000 Subject: [PATCH 16/16] add a solr test 
for missing dv git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5178@1515959 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/solr/schema/SchemaField.java | 12 +- .../conf/schema-docValuesMissing.xml | 61 +++ .../solr/schema/DocValuesMissingTest.java | 479 ++++++++++++++++++ 3 files changed, 550 insertions(+), 2 deletions(-) create mode 100644 solr/core/src/test-files/solr/collection1/conf/schema-docValuesMissing.xml create mode 100644 solr/core/src/test/org/apache/solr/schema/DocValuesMissingTest.java diff --git a/solr/core/src/java/org/apache/solr/schema/SchemaField.java b/solr/core/src/java/org/apache/solr/schema/SchemaField.java index 5a1fb5b82aa..ca6bd90d1b5 100644 --- a/solr/core/src/java/org/apache/solr/schema/SchemaField.java +++ b/solr/core/src/java/org/apache/solr/schema/SchemaField.java @@ -239,13 +239,21 @@ public final class SchemaField extends FieldProperties { if (on(falseProps,INDEXED)) { int pp = (INDEXED - | STORE_TERMVECTORS | STORE_TERMPOSITIONS | STORE_TERMOFFSETS - | SORT_MISSING_FIRST | SORT_MISSING_LAST); + | STORE_TERMVECTORS | STORE_TERMPOSITIONS | STORE_TERMOFFSETS); if (on(pp,trueProps)) { throw new RuntimeException("SchemaField: " + name + " conflicting 'true' field options for non-indexed field:" + props); } p &= ~pp; } + + if (on(falseProps,INDEXED) && on(falseProps,DOC_VALUES)) { + int pp = (SORT_MISSING_FIRST | SORT_MISSING_LAST); + if (on(pp,trueProps)) { + throw new RuntimeException("SchemaField: " + name + " conflicting 'true' field options for non-indexed/non-docValues field:" + props); + } + p &= ~pp; + } + if (on(falseProps,INDEXED)) { int pp = (OMIT_NORMS | OMIT_TF_POSITIONS | OMIT_POSITIONS); if (on(pp,falseProps)) { diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-docValuesMissing.xml b/solr/core/src/test-files/solr/collection1/conf/schema-docValuesMissing.xml new file mode 100644 index 00000000000..05f73e15f29 --- /dev/null +++ b/solr/core/src/test-files/solr/collection1/conf/schema-docValuesMissing.xml @@ -0,0 +1,61 @@ + [61-line schema body lost to extraction: per the tests below it defines docValues field types and fields floatdv, intdv, longdv, doubledv, datedv and stringdv, each with _missingfirst (sortMissingFirst="true") and _missinglast (sortMissingLast="true") variants, plus uniqueKey id] + diff --git a/solr/core/src/test/org/apache/solr/schema/DocValuesMissingTest.java b/solr/core/src/test/org/apache/solr/schema/DocValuesMissingTest.java new file mode 100644 index 00000000000..a0dc27bc70c --- /dev/null +++ b/solr/core/src/test/org/apache/solr/schema/DocValuesMissingTest.java @@ -0,0 +1,479 @@ +package org.apache.solr.schema; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; +import org.apache.solr.SolrTestCaseJ4; +import org.junit.BeforeClass; + +/** + * Tests things like sorting on docvalues with missing values + */ +@SuppressCodecs({"Lucene40", "Lucene41", "Lucene42"}) // old formats cannot represent missing values +public class DocValuesMissingTest extends SolrTestCaseJ4 { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig-basic.xml", "schema-docValuesMissing.xml"); + } + + @Override + public void setUp() throws Exception { + super.setUp(); + clearIndex(); + assertU(commit()); + } + + /** float with default lucene sort (treats as 0) */ + public void testFloatSort() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "floatdv", "-1.3")); + assertU(adoc("id", "2", "floatdv", "4.2")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "floatdv asc"), + "//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=0]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "floatdv desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=0]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** float with sort missing always first */ + public void testFloatSortMissingFirst() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "floatdv_missingfirst", "-1.3")); + assertU(adoc("id", "2", "floatdv_missingfirst", "4.2")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "floatdv_missingfirst asc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "floatdv_missingfirst desc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** float with sort missing always last */ + public void testFloatSortMissingLast() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "floatdv_missinglast", "-1.3")); + assertU(adoc("id", "2", "floatdv_missinglast", "4.2")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "floatdv_missinglast asc"), + "//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=0]"); + assertQ(req("q", "*:*", "sort", "floatdv_missinglast desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=0]"); + } + + /** float function query based on missing */ + public void testFloatMissingFunction() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "floatdv", "-1.3")); + assertU(adoc("id", "2", "floatdv", "4.2")); + assertU(commit()); + assertQ(req("q", "*:*", "fl", "e:exists(floatdv)", "sort", "id asc"), + "//result/doc[1]/bool[@name='e'][.='false']", + "//result/doc[2]/bool[@name='e'][.='true']", + "//result/doc[3]/bool[@name='e'][.='true']"); + } + + /** float missing facet count */ + public void testFloatMissingFacet() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1")); // missing + assertU(adoc("id", "2", "floatdv", "-1.3")); + assertU(adoc("id", "3", "floatdv", "4.2")); + assertU(commit()); + assertQ(req("q", "*:*", "facet", "true", "facet.field", "floatdv", "facet.mincount", "1", "facet.missing", "true"), + "//lst[@name='facet_fields']/lst[@name='floatdv']/int[@name='-1.3'][.=1]", + 
"//lst[@name='facet_fields']/lst[@name='floatdv']/int[@name='4.2'][.=1]", + "//lst[@name='facet_fields']/lst[@name='floatdv']/int[.=2]"); + } + + /** int with default lucene sort (treats as 0) */ + public void testIntSort() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "intdv", "-1")); + assertU(adoc("id", "2", "intdv", "4")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "intdv asc"), + "//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=0]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "intdv desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=0]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** int with sort missing always first */ + public void testIntSortMissingFirst() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "intdv_missingfirst", "-1")); + assertU(adoc("id", "2", "intdv_missingfirst", "4")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "intdv_missingfirst asc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "intdv_missingfirst desc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** int with sort missing always last */ + public void testIntSortMissingLast() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "intdv_missinglast", "-1")); + assertU(adoc("id", "2", "intdv_missinglast", "4")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "intdv_missinglast asc"), + "//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=0]"); + assertQ(req("q", "*:*", "sort", "intdv_missinglast desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=0]"); + } + + /** int function query based on missing */ + public void testIntMissingFunction() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "intdv", "-1")); + assertU(adoc("id", "2", "intdv", "4")); + assertU(commit()); + assertQ(req("q", "*:*", "fl", "e:exists(intdv)", "sort", "id asc"), + "//result/doc[1]/bool[@name='e'][.='false']", + "//result/doc[2]/bool[@name='e'][.='true']", + "//result/doc[3]/bool[@name='e'][.='true']"); + } + + /** int missing facet count */ + public void testIntMissingFacet() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1")); // missing + assertU(adoc("id", "2", "intdv", "-1")); + assertU(adoc("id", "3", "intdv", "4")); + assertU(commit()); + assertQ(req("q", "*:*", "facet", "true", "facet.field", "intdv", "facet.mincount", "1", "facet.missing", "true"), + "//lst[@name='facet_fields']/lst[@name='intdv']/int[@name='-1'][.=1]", + "//lst[@name='facet_fields']/lst[@name='intdv']/int[@name='4'][.=1]", + "//lst[@name='facet_fields']/lst[@name='intdv']/int[.=2]"); + } + + /** double with default lucene sort (treats as 0) */ + public void testDoubleSort() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "doubledv", "-1.3")); + assertU(adoc("id", "2", "doubledv", "4.2")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "doubledv asc"), + "//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=0]", + 
"//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "doubledv desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=0]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** double with sort missing always first */ + public void testDoubleSortMissingFirst() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "doubledv_missingfirst", "-1.3")); + assertU(adoc("id", "2", "doubledv_missingfirst", "4.2")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "doubledv_missingfirst asc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "doubledv_missingfirst desc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** double with sort missing always last */ + public void testDoubleSortMissingLast() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "doubledv_missinglast", "-1.3")); + assertU(adoc("id", "2", "doubledv_missinglast", "4.2")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "doubledv_missinglast asc"), + "//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=0]"); + assertQ(req("q", "*:*", "sort", "doubledv_missinglast desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=0]"); + } + + /** double function query based on missing */ + public void testDoubleMissingFunction() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "doubledv", "-1.3")); + assertU(adoc("id", "2", "doubledv", "4.2")); + assertU(commit()); + assertQ(req("q", "*:*", "fl", "e:exists(doubledv)", "sort", "id asc"), + "//result/doc[1]/bool[@name='e'][.='false']", + "//result/doc[2]/bool[@name='e'][.='true']", + "//result/doc[3]/bool[@name='e'][.='true']"); + } + + /** double missing facet count */ + public void testDoubleMissingFacet() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1")); // missing + assertU(adoc("id", "2", "doubledv", "-1.3")); + assertU(adoc("id", "3", "doubledv", "4.2")); + assertU(commit()); + assertQ(req("q", "*:*", "facet", "true", "facet.field", "doubledv", "facet.mincount", "1", "facet.missing", "true"), + "//lst[@name='facet_fields']/lst[@name='doubledv']/int[@name='-1.3'][.=1]", + "//lst[@name='facet_fields']/lst[@name='doubledv']/int[@name='4.2'][.=1]", + "//lst[@name='facet_fields']/lst[@name='doubledv']/int[.=2]"); + } + + /** long with default lucene sort (treats as 0) */ + public void testLongSort() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "longdv", "-1")); + assertU(adoc("id", "2", "longdv", "4")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "longdv asc"), + "//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=0]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "longdv desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=0]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** long with sort missing always first */ + public void testLongSortMissingFirst() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "longdv_missingfirst", "-1")); + assertU(adoc("id", "2", "longdv_missingfirst", 
"4")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "longdv_missingfirst asc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "longdv_missingfirst desc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** long with sort missing always last */ + public void testLongSortMissingLast() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "longdv_missinglast", "-1")); + assertU(adoc("id", "2", "longdv_missinglast", "4")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "longdv_missinglast asc"), + "//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=0]"); + assertQ(req("q", "*:*", "sort", "longdv_missinglast desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=0]"); + } + + /** long function query based on missing */ + public void testLongMissingFunction() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "longdv", "-1")); + assertU(adoc("id", "2", "longdv", "4")); + assertU(commit()); + assertQ(req("q", "*:*", "fl", "e:exists(longdv)", "sort", "id asc"), + "//result/doc[1]/bool[@name='e'][.='false']", + "//result/doc[2]/bool[@name='e'][.='true']", + "//result/doc[3]/bool[@name='e'][.='true']"); + } + + /** long missing facet count */ + public void testLongMissingFacet() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1")); // missing + assertU(adoc("id", "2", "longdv", "-1")); + assertU(adoc("id", "3", "longdv", "4")); + assertU(commit()); + assertQ(req("q", "*:*", "facet", "true", "facet.field", "longdv", "facet.mincount", "1", "facet.missing", "true"), + "//lst[@name='facet_fields']/lst[@name='longdv']/int[@name='-1'][.=1]", + "//lst[@name='facet_fields']/lst[@name='longdv']/int[@name='4'][.=1]", + "//lst[@name='facet_fields']/lst[@name='longdv']/int[.=2]"); + } + + /** date with default lucene sort (treats as 1970) */ + public void testDateSort() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "datedv", "1900-12-31T23:59:59.999Z")); + assertU(adoc("id", "2", "datedv", "2005-12-31T23:59:59.999Z")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "datedv asc"), + "//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=0]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "datedv desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=0]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** date with sort missing always first */ + public void testDateSortMissingFirst() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "datedv_missingfirst", "1900-12-31T23:59:59.999Z")); + assertU(adoc("id", "2", "datedv_missingfirst", "2005-12-31T23:59:59.999Z")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "datedv_missingfirst asc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "datedv_missingfirst desc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** date with sort missing always last 
*/ + public void testDateSortMissingLast() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "datedv_missinglast", "1900-12-31T23:59:59.999Z")); + assertU(adoc("id", "2", "datedv_missinglast", "2005-12-31T23:59:59.999Z")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "datedv_missinglast asc"), + "//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=0]"); + assertQ(req("q", "*:*", "sort", "datedv_missinglast desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=0]"); + } + + /** date function query based on missing */ + public void testDateMissingFunction() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "datedv", "1900-12-31T23:59:59.999Z")); + assertU(adoc("id", "2", "datedv", "2005-12-31T23:59:59.999Z")); + assertU(commit()); + assertQ(req("q", "*:*", "fl", "e:exists(datedv)", "sort", "id asc"), + "//result/doc[1]/bool[@name='e'][.='false']", + "//result/doc[2]/bool[@name='e'][.='true']", + "//result/doc[3]/bool[@name='e'][.='true']"); + } + + /** date missing facet count */ + public void testDateMissingFacet() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1")); // missing + assertU(adoc("id", "2", "datedv", "1900-12-31T23:59:59.999Z")); + assertU(adoc("id", "3", "datedv", "2005-12-31T23:59:59.999Z")); + assertU(commit()); + assertQ(req("q", "*:*", "facet", "true", "facet.field", "datedv", "facet.mincount", "1", "facet.missing", "true"), + "//lst[@name='facet_fields']/lst[@name='datedv']/int[@name='1900-12-31T23:59:59.999Z'][.=1]", + "//lst[@name='facet_fields']/lst[@name='datedv']/int[@name='2005-12-31T23:59:59.999Z'][.=1]", + "//lst[@name='facet_fields']/lst[@name='datedv']/int[.=2]"); + } + + /** string with default lucene sort (treats as "") */ + public void testStringSort() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "stringdv", "a")); + assertU(adoc("id", "2", "stringdv", "z")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "stringdv asc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "stringdv desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=0]"); + } + + /** string with sort missing always first */ + public void testStringSortMissingFirst() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "stringdv_missingfirst", "a")); + assertU(adoc("id", "2", "stringdv_missingfirst", "z")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "stringdv_missingfirst asc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=2]"); + assertQ(req("q", "*:*", "sort", "stringdv_missingfirst desc"), + "//result/doc[1]/str[@name='id'][.=0]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=1]"); + } + + /** string with sort missing always last */ + public void testStringSortMissingLast() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "stringdv_missinglast", "a")); + assertU(adoc("id", "2", "stringdv_missinglast", "z")); + assertU(commit()); + assertQ(req("q", "*:*", "sort", "stringdv_missinglast asc"), + 
"//result/doc[1]/str[@name='id'][.=1]", + "//result/doc[2]/str[@name='id'][.=2]", + "//result/doc[3]/str[@name='id'][.=0]"); + assertQ(req("q", "*:*", "sort", "stringdv_missinglast desc"), + "//result/doc[1]/str[@name='id'][.=2]", + "//result/doc[2]/str[@name='id'][.=1]", + "//result/doc[3]/str[@name='id'][.=0]"); + } + + /** string function query based on missing */ + public void testStringMissingFunction() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1", "stringdv", "a")); + assertU(adoc("id", "2", "stringdv", "z")); + assertU(commit()); + assertQ(req("q", "*:*", "fl", "e:exists(stringdv)", "sort", "id asc"), + "//result/doc[1]/bool[@name='e'][.='false']", + "//result/doc[2]/bool[@name='e'][.='true']", + "//result/doc[3]/bool[@name='e'][.='true']"); + } + + /** string missing facet count */ + public void testStringMissingFacet() throws Exception { + assertU(adoc("id", "0")); // missing + assertU(adoc("id", "1")); // missing + assertU(adoc("id", "2", "stringdv", "a")); + assertU(adoc("id", "3", "stringdv", "z")); + assertU(commit()); + assertQ(req("q", "*:*", "facet", "true", "facet.field", "stringdv", "facet.mincount", "1", "facet.missing", "true"), + "//lst[@name='facet_fields']/lst[@name='stringdv']/int[@name='a'][.=1]", + "//lst[@name='facet_fields']/lst[@name='stringdv']/int[@name='z'][.=1]", + "//lst[@name='facet_fields']/lst[@name='stringdv']/int[.=2]"); + } +}