From 4991491bda7afd2e05b4ff0b52922564dffb2eed Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sun, 17 Feb 2013 01:16:53 +0000 Subject: [PATCH] file formats git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4765@1446988 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/codecs/lucene41/package.html | 3 ++- .../codecs/lucene42/Lucene42DocValuesFormat.java | 10 ++++++++-- .../org/apache/lucene/codecs/lucene42/package.html | 6 +++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/package.html b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/package.html index 6a8a5b1b979..3df02932d1b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/package.html +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/package.html @@ -375,7 +375,8 @@ can optionally be indexed into the postings lists. Payloads can be stored in the term vectors.
  • In version 4.1, the format of the postings list changed to use either of FOR compression or variable-byte encoding, depending upon the frequency -of the term.
  • +of the term. Terms appearing only once were changed to inline directly into +the term dictionary. Stored fields are compressed by default.

    Limitations

    diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java index 640218cd325..1cef4355dba 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java @@ -34,7 +34,7 @@ import org.apache.lucene.util.packed.BlockPackedWriter; /** * Lucene 4.2 DocValues format. *

    - * Encodes the three per-document value types (Numeric,Binary,Sorted) with five basic strategies. + * Encodes the four per-document value types (Numeric,Binary,Sorted,SortedSet) with seven basic strategies. *

    *

    *

    * Files: @@ -77,6 +79,8 @@ import org.apache.lucene.util.packed.BlockPackedWriter; * *

    Sorted fields have two entries: a SortedEntry with the FST metadata, * and an ordinary NumericEntry for the document-to-ord metadata.

    + *

    SortedSet fields have two entries: a SortedEntry with the FST metadata, + * and an ordinary BinaryEntry for the document-to-ord-list metadata.

    *

    FieldNumber of -1 indicates the end of metadata.

    *

    EntryType is a 0 (NumericEntry), 1 (BinaryEntry), or 2 (SortedEntry)

    *

    DataOffset is the pointer to the start of the data in the DocValues data (.dvd)

    @@ -107,6 +111,8 @@ import org.apache.lucene.util.packed.BlockPackedWriter; *
  • UncompressedNumerics --> {@link DataOutput#writeByte Byte}maxdoc
  • *
  • Addresses --> {@link MonotonicBlockPackedWriter MonotonicBlockPackedInts(blockSize=4096)}
  • * + *

    SortedSet entries store the list of ordinals in their BinaryData as a + * sequence of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.

    * */ public final class Lucene42DocValuesFormat extends DocValuesFormat { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html index 6cca5e8de9f..9ed17df4483 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/package.html @@ -375,7 +375,11 @@ can optionally be indexed into the postings lists. Payloads can be stored in the term vectors.
  • In version 4.1, the format of the postings list changed to use either of FOR compression or variable-byte encoding, depending upon the frequency -of the term.
  • +of the term. Terms appearing only once were changed to inline directly into +the term dictionary. Stored fields are compressed by default. +
  • In version 4.2, term vectors are compressed by default. DocValues has +a new multi-valued type (SortedSet) that can be used for faceting/grouping/joining +on multi-valued fields.
  • Limitations