mirror of https://github.com/apache/lucene.git
LUCENE-8585: Create jump-tables for DocValues at index-time
parent 68ed797c94
commit c13645bd4c

@@ -243,6 +243,9 @@ Optimizations
 * LUCENE-8607: MatchAllDocsQuery can shortcut when total hit count is not
   required (Alan Woodward, Adrien Grand)

+* LUCENE-8585: Index-time jump-tables for DocValues, for O(1) advance when retrieving doc values.
+  (Toke Eskildsen, Adrien Grand)
+
 ======================= Lucene 7.7.0 =======================

 Changes in Runtime Behavior
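The practical effect of the new jump-tables is on the doc-values advance path. As an illustrative consumer-side sketch (not part of the patch; the "price" field and the surrounding class are invented), this is the kind of skipping access that the O(1) advance speeds up:

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;

public class JumpTableConsumerSketch {
  /** Sums the "price" doc value for an increasing subset of docIDs.
   *  With LUCENE-8585, each advanceExact to a far-away docID no longer has to
   *  iterate block by block from the current position. */
  static long sumPrices(LeafReader reader, int[] sortedDocIDs) throws IOException {
    NumericDocValues prices = reader.getNumericDocValues("price"); // hypothetical field
    if (prices == null) {
      return 0;
    }
    long sum = 0;
    for (int docID : sortedDocIDs) {
      if (prices.advanceExact(docID)) { // the call that benefits from the jump-tables
        sum += prices.longValue();
      }
    }
    return sum;
  }
}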
@@ -13,3 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat
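Assuming this hunk is the SPI registration file for org.apache.lucene.codecs.DocValuesFormat (the file path is not shown in this excerpt), the entry above is what keeps the old format resolvable by name. A minimal sketch of how such registrations are looked up at runtime, assuming both formats are on the classpath:

import org.apache.lucene.codecs.DocValuesFormat;

public class FormatLookupSketch {
  public static void main(String[] args) {
    // Resolved through the service-provider entries; forName throws if no provider is registered.
    DocValuesFormat legacy = DocValuesFormat.forName("Lucene70");
    DocValuesFormat current = DocValuesFormat.forName("Lucene80");
    System.out.println(legacy.getName() + " / " + current.getName());
  }
}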
@@ -17,7 +17,7 @@

 /**
  * Components from the Lucene 5.0 index format
- * See {@link org.apache.lucene.codecs.lucene50} for an overview
+ * See {@link org.apache.lucene.codecs.lucene80} for an overview
  * of the index format.
  */
 package org.apache.lucene.codecs.lucene50;
@@ -16,7 +16,7 @@
  */

 /**
- * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene70}
+ * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene80}
  * for an overview of the current index format.
  */
 package org.apache.lucene.codecs.lucene60;
@@ -16,390 +16,7 @@
  */

 /**
- * Lucene 7.0 file format.
- *
+ * Components from the Lucene 7.0 index format. See {@link org.apache.lucene.codecs.lucene80}
+ * for an overview of the current index format.
* <h1>Apache Lucene - Index File Formats</h1>
|
|
||||||
* <div>
|
|
||||||
* <ul>
|
|
||||||
* <li><a href="#Introduction">Introduction</a></li>
|
|
||||||
* <li><a href="#Definitions">Definitions</a>
|
|
||||||
* <ul>
|
|
||||||
* <li><a href="#Inverted_Indexing">Inverted Indexing</a></li>
|
|
||||||
* <li><a href="#Types_of_Fields">Types of Fields</a></li>
|
|
||||||
* <li><a href="#Segments">Segments</a></li>
|
|
||||||
* <li><a href="#Document_Numbers">Document Numbers</a></li>
|
|
||||||
* </ul>
|
|
||||||
* </li>
|
|
||||||
* <li><a href="#Overview">Index Structure Overview</a></li>
|
|
||||||
* <li><a href="#File_Naming">File Naming</a></li>
|
|
||||||
* <li><a href="#file-names">Summary of File Extensions</a>
|
|
||||||
* <ul>
|
|
||||||
* <li><a href="#Lock_File">Lock File</a></li>
|
|
||||||
* <li><a href="#History">History</a></li>
|
|
||||||
* <li><a href="#Limitations">Limitations</a></li>
|
|
||||||
* </ul>
|
|
||||||
* </li>
|
|
||||||
* </ul>
|
|
||||||
* </div>
|
|
||||||
* <a name="Introduction"></a>
|
|
||||||
* <h2>Introduction</h2>
|
|
||||||
* <div>
|
|
||||||
* <p>This document defines the index file formats used in this version of Lucene.
|
|
||||||
* If you are using a different version of Lucene, please consult the copy of
|
|
||||||
* <code>docs/</code> that was distributed with
|
|
||||||
* the version you are using.</p>
|
|
||||||
* <p>This document attempts to provide a high-level definition of the Apache
|
|
||||||
* Lucene file formats.</p>
|
|
||||||
* </div>
|
|
||||||
* <a name="Definitions"></a>
|
|
||||||
* <h2>Definitions</h2>
|
|
||||||
* <div>
|
|
||||||
* <p>The fundamental concepts in Lucene are index, document, field and term.</p>
|
|
||||||
* <p>An index contains a sequence of documents.</p>
|
|
||||||
* <ul>
|
|
||||||
* <li>A document is a sequence of fields.</li>
|
|
||||||
* <li>A field is a named sequence of terms.</li>
|
|
||||||
* <li>A term is a sequence of bytes.</li>
|
|
||||||
* </ul>
|
|
||||||
* <p>The same sequence of bytes in two different fields is considered a different
|
|
||||||
* term. Thus terms are represented as a pair: the string naming the field, and the
|
|
||||||
* bytes within the field.</p>
|
|
||||||
* <a name="Inverted_Indexing"></a>
|
|
||||||
* <h3>Inverted Indexing</h3>
|
|
||||||
* <p>The index stores statistics about terms in order to make term-based search
|
|
||||||
* more efficient. Lucene's index falls into the family of indexes known as an
|
|
||||||
* <i>inverted index.</i> This is because it can list, for a term, the documents
|
|
||||||
* that contain it. This is the inverse of the natural relationship, in which
|
|
||||||
* documents list terms.</p>
|
|
||||||
* <a name="Types_of_Fields"></a>
|
|
||||||
* <h3>Types of Fields</h3>
|
|
||||||
* <p>In Lucene, fields may be <i>stored</i>, in which case their text is stored
|
|
||||||
* in the index literally, in a non-inverted manner. Fields that are inverted are
|
|
||||||
* called <i>indexed</i>. A field may be both stored and indexed.</p>
|
|
||||||
* <p>The text of a field may be <i>tokenized</i> into terms to be indexed, or the
|
|
||||||
* text of a field may be used literally as a term to be indexed. Most fields are
|
|
||||||
* tokenized, but sometimes it is useful for certain identifier fields to be
|
|
||||||
* indexed literally.</p>
|
|
||||||
* <p>See the {@link org.apache.lucene.document.Field Field}
|
|
||||||
* java docs for more information on Fields.</p>
|
|
||||||
* <a name="Segments"></a>
|
|
||||||
* <h3>Segments</h3>
|
|
||||||
* <p>Lucene indexes may be composed of multiple sub-indexes, or <i>segments</i>.
|
|
||||||
* Each segment is a fully independent index, which could be searched separately.
|
|
||||||
* Indexes evolve by:</p>
|
|
||||||
* <ol>
|
|
||||||
* <li>Creating new segments for newly added documents.</li>
|
|
||||||
* <li>Merging existing segments.</li>
|
|
||||||
* </ol>
|
|
||||||
* <p>Searches may involve multiple segments and/or multiple indexes, each index
|
|
||||||
* potentially composed of a set of segments.</p>
|
|
||||||
* <a name="Document_Numbers"></a>
|
|
||||||
* <h3>Document Numbers</h3>
|
|
||||||
* <p>Internally, Lucene refers to documents by an integer <i>document number</i>.
|
|
||||||
* The first document added to an index is numbered zero, and each subsequent
|
|
||||||
* document added gets a number one greater than the previous.</p>
|
|
||||||
* <p>Note that a document's number may change, so caution should be taken when
|
|
||||||
* storing these numbers outside of Lucene. In particular, numbers may change in
|
|
||||||
* the following situations:</p>
|
|
||||||
* <ul>
|
|
||||||
* <li>
|
|
||||||
* <p>The numbers stored in each segment are unique only within the segment, and
|
|
||||||
* must be converted before they can be used in a larger context. The standard
|
|
||||||
* technique is to allocate each segment a range of values, based on the range of
|
|
||||||
* numbers used in that segment. To convert a document number from a segment to an
|
|
||||||
* external value, the segment's <i>base</i> document number is added. To convert
|
|
||||||
* an external value back to a segment-specific value, the segment is identified
|
|
||||||
* by the range that the external value is in, and the segment's base value is
|
|
||||||
* subtracted. For example two five document segments might be combined, so that
|
|
||||||
* the first segment has a base value of zero, and the second of five. Document
|
|
||||||
* three from the second segment would have an external value of eight.</p>
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* <p>When documents are deleted, gaps are created in the numbering. These are
|
|
||||||
* eventually removed as the index evolves through merging. Deleted documents are
|
|
||||||
* dropped when segments are merged. A freshly-merged segment thus has no gaps in
|
|
||||||
* its numbering.</p>
|
|
||||||
* </li>
|
|
||||||
* </ul>
|
|
||||||
* </div>
|
|
||||||
* <a name="Overview"></a>
|
|
||||||
* <h2>Index Structure Overview</h2>
|
|
||||||
* <div>
|
|
||||||
* <p>Each segment index maintains the following:</p>
|
|
||||||
* <ul>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene70.Lucene70SegmentInfoFormat Segment info}.
|
|
||||||
* This contains metadata about a segment, such as the number of documents,
|
|
||||||
* what files it uses,
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Field names}.
|
|
||||||
* This contains the set of field names used in the index.
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Stored Field values}.
|
|
||||||
* This contains, for each document, a list of attribute-value pairs, where the attributes
|
|
||||||
* are field names. These are used to store auxiliary information about the document, such as
|
|
||||||
* its title, url, or an identifier to access a database. The set of stored fields are what is
|
|
||||||
* returned for each hit when searching. This is keyed by document number.
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term dictionary}.
|
|
||||||
* A dictionary containing all of the terms used in all of the
|
|
||||||
* indexed fields of all of the documents. The dictionary also contains the number
|
|
||||||
* of documents which contain the term, and pointers to the term's frequency and
|
|
||||||
* proximity data.
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Frequency data}.
|
|
||||||
* For each term in the dictionary, the numbers of all the
|
|
||||||
* documents that contain that term, and the frequency of the term in that
|
|
||||||
* document, unless frequencies are omitted (IndexOptions.DOCS_ONLY)
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Proximity data}.
|
|
||||||
* For each term in the dictionary, the positions that the
|
|
||||||
* term occurs in each document. Note that this will not exist if all fields in
|
|
||||||
* all documents omit position data.
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Normalization factors}.
|
|
||||||
* For each field in each document, a value is stored
|
|
||||||
* that is multiplied into the score for hits on that field.
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
|
|
||||||
* For each field in each document, the term vector (sometimes
|
|
||||||
* called document vector) may be stored. A term vector consists of term text and
|
|
||||||
* term frequency. To add Term Vectors to your index see the
|
|
||||||
* {@link org.apache.lucene.document.Field Field} constructors
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-document values}.
|
|
||||||
* Like stored values, these are also keyed by document
|
|
||||||
* number, but are generally intended to be loaded into main memory for fast
|
|
||||||
* access. Whereas stored values are generally intended for summary results from
|
|
||||||
* searches, per-document values are useful for things like scoring factors.
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
|
|
||||||
* An optional file indicating which documents are live.
|
|
||||||
* </li>
|
|
||||||
* <li>
|
|
||||||
* {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}.
|
|
||||||
* Optional pair of files, recording dimensionally indexed fields, to enable fast
|
|
||||||
* numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
|
|
||||||
* and geographic shape intersection (2D, 3D).
|
|
||||||
* </li>
|
|
||||||
* </ul>
|
|
||||||
* <p>Details on each of these are provided in their linked pages.</p>
|
|
||||||
* </div>
|
|
||||||
* <a name="File_Naming"></a>
|
|
||||||
* <h2>File Naming</h2>
|
|
||||||
* <div>
|
|
||||||
* <p>All files belonging to a segment have the same name with varying extensions.
|
|
||||||
* The extensions correspond to the different file formats described below. When
|
|
||||||
* using the Compound File format (default for small segments) these files (except
|
|
||||||
* for the Segment info file, the Lock file, and Deleted documents file) are collapsed
|
|
||||||
* into a single .cfs file (see below for details)</p>
|
|
||||||
* <p>Typically, all segments in an index are stored in a single directory,
|
|
||||||
* although this is not required.</p>
|
|
||||||
* <p>File names are never re-used. That is, when any file is saved
|
|
||||||
* to the Directory it is given a never before used filename. This is achieved
|
|
||||||
* using a simple generations approach. For example, the first segments file is
|
|
||||||
* segments_1, then segments_2, etc. The generation is a sequential long integer
|
|
||||||
* represented in alpha-numeric (base 36) form.</p>
|
|
||||||
* </div>
|
|
||||||
* <a name="file-names"></a>
|
|
||||||
* <h2>Summary of File Extensions</h2>
|
|
||||||
* <div>
|
|
||||||
* <p>The following table summarizes the names and extensions of the files in
|
|
||||||
* Lucene:</p>
|
|
||||||
* <table cellspacing="1" cellpadding="4" summary="lucene filenames by extension">
|
|
||||||
* <tr>
|
|
||||||
* <th>Name</th>
|
|
||||||
* <th>Extension</th>
|
|
||||||
* <th>Brief Description</th>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.index.SegmentInfos Segments File}</td>
|
|
||||||
* <td>segments_N</td>
|
|
||||||
* <td>Stores information about a commit point</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td><a href="#Lock_File">Lock File</a></td>
|
|
||||||
* <td>write.lock</td>
|
|
||||||
* <td>The Write lock prevents multiple IndexWriters from writing to the same
|
|
||||||
* file.</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene70.Lucene70SegmentInfoFormat Segment Info}</td>
|
|
||||||
* <td>.si</td>
|
|
||||||
* <td>Stores metadata about a segment</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}</td>
|
|
||||||
* <td>.cfs, .cfe</td>
|
|
||||||
* <td>An optional "virtual" file consisting of all the other index files for
|
|
||||||
* systems that frequently run out of file handles.</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50FieldInfosFormat Fields}</td>
|
|
||||||
* <td>.fnm</td>
|
|
||||||
* <td>Stores information about the fields</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Index}</td>
|
|
||||||
* <td>.fdx</td>
|
|
||||||
* <td>Contains pointers to field data</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat Field Data}</td>
|
|
||||||
* <td>.fdt</td>
|
|
||||||
* <td>The stored fields for documents</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Dictionary}</td>
|
|
||||||
* <td>.tim</td>
|
|
||||||
* <td>The term dictionary, stores term info</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Term Index}</td>
|
|
||||||
* <td>.tip</td>
|
|
||||||
* <td>The index into the Term Dictionary</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Frequencies}</td>
|
|
||||||
* <td>.doc</td>
|
|
||||||
* <td>Contains the list of docs which contain each term along with frequency</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Positions}</td>
|
|
||||||
* <td>.pos</td>
|
|
||||||
* <td>Stores position information about where a term occurs in the index</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat Payloads}</td>
|
|
||||||
* <td>.pay</td>
|
|
||||||
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms}</td>
|
|
||||||
* <td>.nvd, .nvm</td>
|
|
||||||
* <td>Encodes length and boost factors for docs and fields</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values}</td>
|
|
||||||
* <td>.dvd, .dvm</td>
|
|
||||||
* <td>Encodes additional scoring factors or other per-document information.</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}</td>
|
|
||||||
* <td>.tvx</td>
|
|
||||||
* <td>Stores offset into the document data file</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data}</td>
|
|
||||||
* <td>.tvd</td>
|
|
||||||
* <td>Contains term vector data.</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
|
|
||||||
* <td>.liv</td>
|
|
||||||
* <td>Info about what documents are live</td>
|
|
||||||
* </tr>
|
|
||||||
* <tr>
|
|
||||||
* <td>{@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}</td>
|
|
||||||
* <td>.dii, .dim</td>
|
|
||||||
* <td>Holds indexed points, if any</td>
|
|
||||||
* </tr>
|
|
||||||
* </table>
|
|
||||||
* </div>
|
|
||||||
* <a name="Lock_File"></a>
|
|
||||||
* <h2>Lock File</h2>
|
|
||||||
* The write lock, which is stored in the index directory by default, is named
|
|
||||||
* "write.lock". If the lock directory is different from the index directory then
|
|
||||||
* the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
|
|
||||||
* derived from the full path to the index directory. When this file is present, a
|
|
||||||
* writer is currently modifying the index (adding or removing documents). This
|
|
||||||
* lock file ensures that only one writer is modifying the index at a time.
|
|
||||||
* <a name="History"></a>
|
|
||||||
* <h2>History</h2>
|
|
||||||
* <p>Compatibility notes are provided in this document, describing how file
|
|
||||||
* formats have changed from prior versions:</p>
|
|
||||||
* <ul>
|
|
||||||
* <li>In version 2.1, the file format was changed to allow lock-less commits (ie,
|
|
||||||
* no more commit lock). The change is fully backwards compatible: you can open a
|
|
||||||
* pre-2.1 index for searching or adding/deleting of docs. When the new segments
|
|
||||||
* file is saved (committed), it will be written in the new file format (meaning
|
|
||||||
* no specific "upgrade" process is needed). But note that once a commit has
|
|
||||||
* occurred, pre-2.1 Lucene will not be able to read the index.</li>
|
|
||||||
* <li>In version 2.3, the file format was changed to allow segments to share a
|
|
||||||
* single set of doc store (vectors & stored fields) files. This allows for
|
|
||||||
* faster indexing in certain cases. The change is fully backwards compatible (in
|
|
||||||
* the same way as the lock-less commits change in 2.1).</li>
|
|
||||||
* <li>In version 2.4, Strings are now written as true UTF-8 byte sequence, not
|
|
||||||
* Java's modified UTF-8. See <a href="http://issues.apache.org/jira/browse/LUCENE-510">
|
|
||||||
* LUCENE-510</a> for details.</li>
|
|
||||||
* <li>In version 2.9, an optional opaque Map<String,String> CommitUserData
|
|
||||||
* may be passed to IndexWriter's commit methods (and later retrieved), which is
|
|
||||||
* recorded in the segments_N file. See <a href="http://issues.apache.org/jira/browse/LUCENE-1382">
|
|
||||||
* LUCENE-1382</a> for details. Also,
|
|
||||||
* diagnostics were added to each segment written recording details about why it
|
|
||||||
* was written (due to flush, merge; which OS/JRE was used; etc.). See issue
|
|
||||||
* <a href="http://issues.apache.org/jira/browse/LUCENE-1654">LUCENE-1654</a> for details.</li>
|
|
||||||
* <li>In version 3.0, compressed fields are no longer written to the index (they
|
|
||||||
* can still be read, but on merge the new segment will write them, uncompressed).
|
|
||||||
* See issue <a href="http://issues.apache.org/jira/browse/LUCENE-1960">LUCENE-1960</a>
|
|
||||||
* for details.</li>
|
|
||||||
* <li>In version 3.1, segments records the code version that created them. See
|
|
||||||
* <a href="http://issues.apache.org/jira/browse/LUCENE-2720">LUCENE-2720</a> for details.
|
|
||||||
* Additionally segments track explicitly whether or not they have term vectors.
|
|
||||||
* See <a href="http://issues.apache.org/jira/browse/LUCENE-2811">LUCENE-2811</a>
|
|
||||||
* for details.</li>
|
|
||||||
* <li>In version 3.2, numeric fields are written as natively to stored fields
|
|
||||||
* file, previously they were stored in text format only.</li>
|
|
||||||
* <li>In version 3.4, fields can omit position data while still indexing term
|
|
||||||
* frequencies.</li>
|
|
||||||
* <li>In version 4.0, the format of the inverted index became extensible via
|
|
||||||
* the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
|
|
||||||
* ({@code DocValues}) was introduced. Normalization factors need no longer be a
|
|
||||||
* single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
|
|
||||||
* Terms need not be unicode strings, they can be any byte sequence. Term offsets
|
|
||||||
* can optionally be indexed into the postings lists. Payloads can be stored in the
|
|
||||||
* term vectors.</li>
|
|
||||||
* <li>In version 4.1, the format of the postings list changed to use either
|
|
||||||
* of FOR compression or variable-byte encoding, depending upon the frequency
|
|
||||||
* of the term. Terms appearing only once were changed to inline directly into
|
|
||||||
* the term dictionary. Stored fields are compressed by default. </li>
|
|
||||||
* <li>In version 4.2, term vectors are compressed by default. DocValues has
|
|
||||||
* a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
|
|
||||||
* on multi-valued fields.</li>
|
|
||||||
* <li>In version 4.5, DocValues were extended to explicitly represent missing values.</li>
|
|
||||||
* <li>In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
|
|
||||||
* allow updating NumericDocValues fields.</li>
|
|
||||||
* <li>In version 4.8, checksum footers were added to the end of each index file
|
|
||||||
* for improved data integrity. Specifically, the last 8 bytes of every index file
|
|
||||||
* contain the zlib-crc32 checksum of the file.</li>
|
|
||||||
* <li>In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
|
|
||||||
* that is suitable for faceting/sorting/analytics.
|
|
||||||
* <li>In version 5.4, DocValues have been improved to store more information on disk:
|
|
||||||
* addresses for binary fields and ord indexes for multi-valued fields.
|
|
||||||
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
|
|
||||||
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index sorting.
|
|
||||||
* <li>In version 7.0, DocValues have been improved to better support sparse doc values
|
|
||||||
* thanks to an iterator API.
|
|
||||||
* </li>
|
|
||||||
* </ul>
|
|
||||||
* <a name="Limitations"></a>
|
|
||||||
* <h2>Limitations</h2>
|
|
||||||
* <div>
|
|
||||||
* <p>Lucene uses a Java <code>int</code> to refer to
|
|
||||||
* document numbers, and the index file format uses an <code>Int32</code>
|
|
||||||
* on-disk to store document numbers. This is a limitation
|
|
||||||
* of both the index file format and the current implementation. Eventually these
|
|
||||||
* should be replaced with either <code>UInt64</code> values, or
|
|
||||||
* better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.</p>
|
|
||||||
* </div>
|
|
||||||
*/
package org.apache.lucene.codecs.lucene70;
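The document-numbering example in the javadoc removed above (two five-document segments, document three of the second segment getting external value eight) boils down to adding and subtracting a per-segment base. A tiny illustrative sketch, not part of the patch:

public class SegmentDocNumberSketch {
  public static void main(String[] args) {
    // Mirrors the removed javadoc example: two 5-document segments are combined.
    int[] segmentBases = {0, 5};     // segment 0 starts at base 0, segment 1 at base 5
    int segment = 1;
    int docInSegment = 3;
    int external = segmentBases[segment] + docInSegment;  // 5 + 3 = 8
    int backToLocal = external - segmentBases[segment];   // 8 - 5 = 3
    System.out.println(external + " " + backToLocal);
  }
}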
@@ -0,0 +1,632 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene80;

import java.io.DataInput;
import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RoaringDocIdSet;

/**
 * Disk-based implementation of a {@link DocIdSetIterator} which can return
 * the index of the current document, i.e. the ordinal of the current document
 * among the list of documents that this iterator can return. This is useful
 * to implement sparse doc values by only having to encode values for documents
 * that actually have a value.
 * <p>Implementation-wise, this {@link DocIdSetIterator} is inspired by
 * {@link RoaringDocIdSet roaring bitmaps} and encodes ranges of {@code 65536}
 * documents independently and picks between 3 encodings depending on the
 * density of the range:<ul>
 *   <li>{@code ALL} if the range contains 65536 documents exactly,
 *   <li>{@code DENSE} if the range contains 4096 documents or more; in that
 *       case documents are stored in a bit set,
 *   <li>{@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are
 *       stored in a {@link DataInput#readShort() short}.
 * </ul>
 * <p>Only ranges that contain at least one value are encoded.
 * <p>This implementation uses 6 bytes per document in the worst-case, which happens
 * in the case that all ranges contain exactly one document.
 *
 *
 * To avoid O(n) lookup time complexity, with n being the number of documents, two lookup
 * tables are used: A lookup table for block offset and index, and a rank structure
 * for DENSE block index lookups.
 *
 * The lookup table is an array of {@code int}-pairs, with a pair for each block. It allows for
 * direct jumping to the block, as opposed to iteration from the current position and forward
 * one block at a time.
 *
 * Each int-pair entry consists of 2 logical parts:
 *
 * The first 32 bit int holds the index (number of set bits in the blocks) up to just before the
 * wanted block. The maximum number of set bits is the maximum number of documents, which is < 2^31.
 *
 * The next int holds the offset in bytes into the underlying slice. As there is a maximum of 2^16
 * blocks, it follows that the maximum size of any block must not exceed 2^15 bytes to avoid
 * overflow (2^16 bytes if the int is treated as unsigned). This is currently the case, with the
 * largest block being DENSE and using 2^13 + 36 bytes.
 *
 * The cache overhead is numDocs/1024 bytes.
 *
 * Note: There are 4 types of blocks: ALL, DENSE, SPARSE and non-existing (0 set bits).
 * In the case of non-existing blocks, the entry in the lookup table has index equal to the
 * previous entry and offset equal to the next non-empty block.
 *
 * The block lookup table is stored at the end of the total block structure.
 *
 *
 * The rank structure for DENSE blocks is an array of byte-pairs with an entry for each
 * sub-block (default 512 bits) out of the 65536 bits in the outer DENSE block.
 *
 * Each rank-entry states the number of set bits within the block up to the bit before the
 * bit positioned at the start of the sub-block.
 * Note that the rank entry of the first sub-block is always 0 and that the last entry can
 * at most be 65536-2 = 65534 and thus will always fit into a byte-pair of 16 bits.
 *
 * The rank structure for a given DENSE block is stored at the beginning of the DENSE block.
 * This ensures locality and keeps logistics simple.
 *
 * @lucene.internal
 */
final class IndexedDISI extends DocIdSetIterator {
|
||||||
|
|
||||||
|
// jump-table time/space trade-offs to consider:
|
||||||
|
// The block offsets and the block indexes could be stored in more compressed form with
|
||||||
|
// two PackedInts or two MonotonicDirectReaders.
|
||||||
|
// The DENSE ranks (default 128 shorts = 256 bytes) could likewise be compressed. But as there is
|
||||||
|
// at least 4096 set bits in DENSE blocks, there will be at least one rank with 2^12 bits, so it
|
||||||
|
// is doubtful if there is much to gain here.
|
||||||
|
|
||||||
|
private static final int BLOCK_SIZE = 65536; // The number of docIDs that a single block represents
|
||||||
|
|
||||||
|
private static final int DENSE_BLOCK_LONGS = BLOCK_SIZE/Long.SIZE; // 1024
|
||||||
|
public static final byte DEFAULT_DENSE_RANK_POWER = 9; // Every 512 docIDs / 8 longs
|
||||||
|
|
||||||
|
static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;
|
||||||
|
|
||||||
|
private static void flush(
|
||||||
|
int block, FixedBitSet buffer, int cardinality, byte denseRankPower, IndexOutput out) throws IOException {
|
||||||
|
assert block >= 0 && block < 65536;
|
||||||
|
out.writeShort((short) block);
|
||||||
|
assert cardinality > 0 && cardinality <= 65536;
|
||||||
|
out.writeShort((short) (cardinality - 1));
|
||||||
|
if (cardinality > MAX_ARRAY_LENGTH) {
|
||||||
|
if (cardinality != 65536) { // all docs are set
|
||||||
|
if (denseRankPower != -1) {
|
||||||
|
final byte[] rank = createRank(buffer, denseRankPower);
|
||||||
|
out.writeBytes(rank, rank.length);
|
||||||
|
}
|
||||||
|
for (long word : buffer.getBits()) {
|
||||||
|
out.writeLong(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
BitSetIterator it = new BitSetIterator(buffer, cardinality);
|
||||||
|
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||||
|
out.writeShort((short) doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Creates a DENSE rank-entry (the number of set bits up to a given point) for the buffer.
|
||||||
|
// One rank-entry for every {@code 2^denseRankPower} bits, with each rank-entry using 2 bytes.
|
||||||
|
// Represented as a byte[] for fast flushing and mirroring of the retrieval representation.
|
||||||
|
private static byte[] createRank(FixedBitSet buffer, byte denseRankPower) {
|
||||||
|
final int longsPerRank = 1 << (denseRankPower-6);
|
||||||
|
final int rankMark = longsPerRank-1;
|
||||||
|
final int rankIndexShift = denseRankPower-7; // 6 for the long (2^6) + 1 for 2 bytes/entry
|
||||||
|
final byte[] rank = new byte[DENSE_BLOCK_LONGS >> rankIndexShift];
|
||||||
|
final long[] bits = buffer.getBits();
|
||||||
|
int bitCount = 0;
|
||||||
|
for (int word = 0 ; word < DENSE_BLOCK_LONGS ; word++) {
|
||||||
|
if ((word & rankMark) == 0) { // Every longsPerRank longs
|
||||||
|
rank[word >> rankIndexShift] = (byte)(bitCount>>8);
|
||||||
|
rank[(word >> rankIndexShift)+1] = (byte)(bitCount & 0xFF);
|
||||||
|
}
|
||||||
|
bitCount += Long.bitCount(bits[word]);
|
||||||
|
}
|
||||||
|
return rank;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes the docIDs from it to out, in logical blocks, one for each 65536 docIDs in monotonically increasing
|
||||||
|
* gap-less order. DENSE blocks uses {@link #DEFAULT_DENSE_RANK_POWER} of 9 (every 512 docIDs / 8 longs).
|
||||||
|
* The caller must keep track of the number of jump-table entries (returned by this method) as well as the
|
||||||
|
* denseRankPower (9 for this method) and provide them when constructing an IndexedDISI for reading.
|
||||||
|
* @param it the document IDs.
|
||||||
|
* @param out destination for the blocks.
|
||||||
|
* @throws IOException if there was an error writing to out.
|
||||||
|
* @return the number of jump-table entries following the blocks, -1 for no entries.
|
||||||
|
* This should be stored in meta and used when creating an instance of IndexedDISI.
|
||||||
|
*/
|
||||||
|
static short writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException {
|
||||||
|
return writeBitSet(it, out, DEFAULT_DENSE_RANK_POWER);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes the docIDs from it to out, in logical blocks, one for each 65536 docIDs in monotonically
|
||||||
|
* increasing gap-less order.
|
||||||
|
* The caller must keep track of the number of jump-table entries (returned by this method) as well as the
|
||||||
|
* denseRankPower and provide them when constructing an IndexedDISI for reading.
|
||||||
|
* @param it the document IDs.
|
||||||
|
* @param out destination for the blocks.
|
||||||
|
* @param denseRankPower for {@link Method#DENSE} blocks, a rank will be written every {@code 2^denseRankPower} docIDs.
|
||||||
|
* Values < 7 (every 128 docIDs) or > 15 (every 32768 docIDs) disables DENSE rank.
|
||||||
|
* Recommended values are 8-12: Every 256-4096 docIDs or 4-64 longs.
|
||||||
|
* {@link #DEFAULT_DENSE_RANK_POWER} is 9: Every 512 docIDs.
|
||||||
|
* This should be stored in meta and used when creating an instance of IndexedDISI.
|
||||||
|
* @throws IOException if there was an error writing to out.
|
||||||
|
* @return the number of jump-table entries following the blocks, -1 for no entries.
|
||||||
|
* This should be stored in meta and used when creating an instance of IndexedDISI.
|
||||||
|
*/
|
||||||
|
static short writeBitSet(DocIdSetIterator it, IndexOutput out, byte denseRankPower) throws IOException {
|
||||||
|
final long origo = out.getFilePointer(); // All jumps are relative to the origo
|
||||||
|
if ((denseRankPower < 7 || denseRankPower > 15) && denseRankPower != -1) {
|
||||||
|
throw new IllegalArgumentException("Acceptable values for denseRankPower are 7-15 (every 128-32768 docIDs). " +
|
||||||
|
"The provided power was " + denseRankPower + " (every " + (int)Math.pow(2, denseRankPower) + " docIDs)");
|
||||||
|
}
|
||||||
|
int totalCardinality = 0;
|
||||||
|
int blockCardinality = 0;
|
||||||
|
final FixedBitSet buffer = new FixedBitSet(1<<16);
|
||||||
|
int[] jumps = new int[ArrayUtil.oversize(1, Integer.BYTES*2)];
|
||||||
|
int prevBlock = -1;
|
||||||
|
int jumpBlockIndex = 0;
|
||||||
|
|
||||||
|
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||||
|
final int block = doc >>> 16;
|
||||||
|
if (prevBlock != -1 && block != prevBlock) {
|
||||||
|
// Track offset+index from previous block up to current
|
||||||
|
jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, jumpBlockIndex, prevBlock+1);
|
||||||
|
jumpBlockIndex = prevBlock+1;
|
||||||
|
// Flush block
|
||||||
|
flush(prevBlock, buffer, blockCardinality, denseRankPower, out);
|
||||||
|
// Reset for next block
|
||||||
|
buffer.clear(0, buffer.length());
|
||||||
|
totalCardinality += blockCardinality;
|
||||||
|
blockCardinality = 0;
|
||||||
|
}
|
||||||
|
buffer.set(doc & 0xFFFF);
|
||||||
|
blockCardinality++;
|
||||||
|
prevBlock = block;
|
||||||
|
}
|
||||||
|
if (blockCardinality > 0) {
|
||||||
|
jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, jumpBlockIndex, prevBlock+1);
|
||||||
|
totalCardinality += blockCardinality;
|
||||||
|
flush(prevBlock, buffer, blockCardinality, denseRankPower, out);
|
||||||
|
buffer.clear(0, buffer.length());
|
||||||
|
prevBlock++;
|
||||||
|
}
|
||||||
|
final int lastBlock = prevBlock == -1 ? 0 : prevBlock; // There will always be at least 1 block (NO_MORE_DOCS)
|
||||||
|
// Last entry is a SPARSE with blockIndex == 32767 and the single entry 65535, which becomes the docID NO_MORE_DOCS
|
||||||
|
// To avoid creating 65K jump-table entries, only a single entry is created pointing to the offset of the
|
||||||
|
// NO_MORE_DOCS block, with the jumpBlockIndex set to the logical EMPTY block after all real blocks.
|
||||||
|
jumps = addJumps(jumps, out.getFilePointer()-origo, totalCardinality, lastBlock, lastBlock+1);
|
||||||
|
buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
|
||||||
|
flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, denseRankPower, out);
|
||||||
|
// offset+index jump-table stored at the end
|
||||||
|
return flushBlockJumps(jumps, lastBlock+1, out, origo);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Adds entries to the offset & index jump-table for blocks
|
||||||
|
private static int[] addJumps(int[] jumps, long offset, int index, int startBlock, int endBlock) {
|
||||||
|
assert offset < Integer.MAX_VALUE : "Logically the offset should not exceed 2^30 but was >= Integer.MAX_VALUE";
|
||||||
|
jumps = ArrayUtil.grow(jumps, (endBlock+1)*2);
|
||||||
|
for (int b = startBlock; b < endBlock; b++) {
|
||||||
|
jumps[b*2] = index;
|
||||||
|
jumps[b*2+1] = (int) offset;
|
||||||
|
}
|
||||||
|
return jumps;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Flushes the offset & index jump-table for blocks. This should be the last data written to out
|
||||||
|
// This method returns the blockCount for the blocks reachable for the jump_table or -1 for no jump-table
|
||||||
|
private static short flushBlockJumps(int[] jumps, int blockCount, IndexOutput out, long origo) throws IOException {
|
||||||
|
if (blockCount == 2) { // Jumps with a single real entry + NO_MORE_DOCS is just wasted space so we ignore that
|
||||||
|
blockCount = 0;
|
||||||
|
}
|
||||||
|
for (int i = 0 ; i < blockCount ; i++) {
|
||||||
|
out.writeInt(jumps[i*2]); // index
|
||||||
|
out.writeInt(jumps[i*2+1]); // offset
|
||||||
|
}
|
||||||
|
// As there are at most 32k blocks, the count is a short
|
||||||
|
// The jumpTableOffset will be at lastPos - (blockCount * Long.BYTES)
|
||||||
|
return (short)blockCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** The slice that stores the {@link DocIdSetIterator}. */
|
||||||
|
private final IndexInput slice;
|
||||||
|
private final int jumpTableEntryCount;
|
||||||
|
private final byte denseRankPower;
|
||||||
|
private final RandomAccessInput jumpTable; // Skip blocks of 64K bits
|
||||||
|
private final byte[] denseRankTable;
|
||||||
|
private final long cost;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This constructor always creates a new blockSlice and a new jumpTable from in, to ensure that operations are
|
||||||
|
* independent from the caller.
|
||||||
|
* See {@link #IndexedDISI(IndexInput, RandomAccessInput, int, byte, long)} for re-use of blockSlice and jumpTable.
|
||||||
|
* @param in backing data.
|
||||||
|
* @param offset starting offset for blocks in the backing data.
|
||||||
|
* @param length the number of bytes holding blocks and jump-table in the backing data.
|
||||||
|
* @param jumpTableEntryCount the number of blocks covered by the jump-table.
|
||||||
|
* This must match the number returned by {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)}.
|
||||||
|
* @param denseRankPower the number of docIDs covered by each rank entry in DENSE blocks, expressed as {@code 2^denseRankPower}.
|
||||||
|
* This must match the power given in {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)}
|
||||||
|
* @param cost normally the number of logical docIDs.
|
||||||
|
*/
|
||||||
|
IndexedDISI(IndexInput in, long offset, long length, int jumpTableEntryCount, byte denseRankPower, long cost) throws IOException {
|
||||||
|
this(createBlockSlice(in,"docs", offset, length, jumpTableEntryCount),
|
||||||
|
createJumpTable(in, offset, length, jumpTableEntryCount),
|
||||||
|
jumpTableEntryCount, denseRankPower, cost);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This constructor allows passing the slice and jumpTable directly, in case it helps reuse.
* See e.g. the Lucene80 norms producer's merge instance.
|
||||||
|
* @param blockSlice data blocks, normally created by {@link #createBlockSlice}.
|
||||||
|
* @param jumpTable table holding jump-data for block-skips, normally created by {@link #createJumpTable}.
|
||||||
|
* @param jumpTableEntryCount the number of blocks covered by the jump-table.
|
||||||
|
* This must match the number returned by {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)}.
|
||||||
|
* @param denseRankPower the number of docIDs covered by each rank entry in DENSE blocks, expressed as {@code 2^denseRankPower}.
|
||||||
|
* This must match the power given in {@link #writeBitSet(DocIdSetIterator, IndexOutput, byte)}
|
||||||
|
* @param cost normally the number of logical docIDs.
|
||||||
|
*/
|
||||||
|
IndexedDISI(IndexInput blockSlice, RandomAccessInput jumpTable, int jumpTableEntryCount, byte denseRankPower, long cost) throws IOException {
|
||||||
|
if ((denseRankPower < 7 || denseRankPower > 15) && denseRankPower != -1) {
|
||||||
|
throw new IllegalArgumentException("Acceptable values for denseRankPower are 7-15 (every 128-32768 docIDs). " +
|
||||||
|
"The provided power was " + denseRankPower + " (every " + (int)Math.pow(2, denseRankPower) + " docIDs). ");
|
||||||
|
}
|
||||||
|
|
||||||
|
this.slice = blockSlice;
|
||||||
|
this.jumpTable = jumpTable;
|
||||||
|
this.jumpTableEntryCount = jumpTableEntryCount;
|
||||||
|
this.denseRankPower = denseRankPower;
|
||||||
|
final int rankIndexShift = denseRankPower-7;
|
||||||
|
this.denseRankTable = denseRankPower == -1 ? null : new byte[DENSE_BLOCK_LONGS >> rankIndexShift];
|
||||||
|
this.cost = cost;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper method for using {@link #IndexedDISI(IndexInput, RandomAccessInput, int, byte, long)}.
|
||||||
|
* Creates a disiSlice for the IndexedDISI data blocks, without the jump-table.
|
||||||
|
* @param slice backing data, holding both blocks and jump-table.
|
||||||
|
* @param sliceDescription human readable slice designation.
|
||||||
|
* @param offset relative to the backing data.
|
||||||
|
* @param length full length of the IndexedDISI, including blocks and jump-table data.
|
||||||
|
* @param jumpTableEntryCount the number of blocks covered by the jump-table.
|
||||||
|
* @return a slice containing only the IndexedDISI data blocks, without the trailing jump-table.
|
||||||
|
* @throws IOException if a RandomAccessInput could not be created from slice.
|
||||||
|
*/
|
||||||
|
public static IndexInput createBlockSlice(
|
||||||
|
IndexInput slice, String sliceDescription, long offset, long length, int jumpTableEntryCount) throws IOException {
|
||||||
|
long jumpTableBytes = jumpTableEntryCount < 0 ? 0 : jumpTableEntryCount*Integer.BYTES*2;
|
||||||
|
return slice.slice(sliceDescription, offset, length - jumpTableBytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper method for using {@link #IndexedDISI(IndexInput, RandomAccessInput, int, byte, long)}.
|
||||||
|
* Creates a RandomAccessInput covering only the jump-table data or null.
|
||||||
|
* @param slice backing data, holding both blocks and jump-table.
|
||||||
|
* @param offset relative to the backing data.
|
||||||
|
* @param length full length of the IndexedDISI, including blocks and jump-table data.
|
||||||
|
* @param jumpTableEntryCount the number of blocks covered by the jump-table.
|
||||||
|
* @return a jumpTable containing the block jump-data or null if no such table exists.
|
||||||
|
* @throws IOException if a RandomAccessInput could not be created from slice.
|
||||||
|
*/
|
||||||
|
public static RandomAccessInput createJumpTable(
|
||||||
|
IndexInput slice, long offset, long length, int jumpTableEntryCount) throws IOException {
|
||||||
|
if (jumpTableEntryCount <= 0) {
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
int jumpTableBytes = jumpTableEntryCount*Integer.BYTES*2;
|
||||||
|
return slice.randomAccessSlice(offset + length - jumpTableBytes, jumpTableBytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int block = -1;
|
||||||
|
private long blockEnd;
|
||||||
|
private long denseBitmapOffset = -1; // Only used for DENSE blocks
|
||||||
|
private int nextBlockIndex = -1;
|
||||||
|
Method method;
|
||||||
|
|
||||||
|
private int doc = -1;
|
||||||
|
private int index = -1;
|
||||||
|
|
||||||
|
// SPARSE variables
|
||||||
|
private boolean exists;
|
||||||
|
|
||||||
|
// DENSE variables
|
||||||
|
private long word;
|
||||||
|
private int wordIndex = -1;
|
||||||
|
// number of one bits encountered so far, including those of `word`
|
||||||
|
private int numberOfOnes;
|
||||||
|
// Used with rank for jumps inside of DENSE as they are absolute instead of relative
|
||||||
|
private int denseOrigoIndex;
|
||||||
|
|
||||||
|
// ALL variables
|
||||||
|
private int gap;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) throws IOException {
|
||||||
|
final int targetBlock = target & 0xFFFF0000;
|
||||||
|
if (block < targetBlock) {
|
||||||
|
advanceBlock(targetBlock);
|
||||||
|
}
|
||||||
|
if (block == targetBlock) {
|
||||||
|
if (method.advanceWithinBlock(this, target)) {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
readBlockHeader();
|
||||||
|
}
|
||||||
|
boolean found = method.advanceWithinBlock(this, block);
|
||||||
|
assert found;
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean advanceExact(int target) throws IOException {
|
||||||
|
final int targetBlock = target & 0xFFFF0000;
|
||||||
|
if (block < targetBlock) {
|
||||||
|
advanceBlock(targetBlock);
|
||||||
|
}
|
||||||
|
boolean found = block == targetBlock && method.advanceExactWithinBlock(this, target);
|
||||||
|
this.doc = target;
|
||||||
|
return found;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void advanceBlock(int targetBlock) throws IOException {
|
||||||
|
final int blockIndex = targetBlock >> 16;
|
||||||
|
// If the destination block is 2 blocks or more ahead, we use the jump-table.
|
||||||
|
if (jumpTable != null && blockIndex >= (block >> 16)+2) {
|
||||||
|
// If the jumpTableEntryCount is exceeded, there are no further bits. Last entry is always NO_MORE_DOCS
|
||||||
|
final int inRangeBlockIndex = blockIndex < jumpTableEntryCount ? blockIndex : jumpTableEntryCount-1;
|
||||||
|
final int index = jumpTable.readInt(inRangeBlockIndex*Integer.BYTES*2);
|
||||||
|
final int offset = jumpTable.readInt(inRangeBlockIndex*Integer.BYTES*2+Integer.BYTES);
|
||||||
|
this.nextBlockIndex = index-1; // -1 to compensate for the always-added 1 in readBlockHeader
|
||||||
|
slice.seek(offset);
|
||||||
|
readBlockHeader();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to iteration of blocks
|
||||||
|
do {
|
||||||
|
slice.seek(blockEnd);
|
||||||
|
readBlockHeader();
|
||||||
|
} while (block < targetBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readBlockHeader() throws IOException {
|
||||||
|
block = Short.toUnsignedInt(slice.readShort()) << 16;
|
||||||
|
assert block >= 0;
|
||||||
|
final int numValues = 1 + Short.toUnsignedInt(slice.readShort());
|
||||||
|
index = nextBlockIndex;
|
||||||
|
nextBlockIndex = index + numValues;
|
||||||
|
if (numValues <= MAX_ARRAY_LENGTH) {
|
||||||
|
method = Method.SPARSE;
|
||||||
|
blockEnd = slice.getFilePointer() + (numValues << 1);
|
||||||
|
} else if (numValues == 65536) {
|
||||||
|
method = Method.ALL;
|
||||||
|
blockEnd = slice.getFilePointer();
|
||||||
|
gap = block - index - 1;
|
||||||
|
} else {
|
||||||
|
method = Method.DENSE;
|
||||||
|
denseBitmapOffset = slice.getFilePointer() + (denseRankTable == null ? 0 : denseRankTable.length);
|
||||||
|
blockEnd = denseBitmapOffset + (1 << 13);
|
||||||
|
// Performance consideration: All rank (default 128 * 16 bits) are loaded up front. This should be fast with the
|
||||||
|
// reusable byte[] buffer, but it is still wasted if the DENSE block is iterated in small steps.
|
||||||
|
// If this results in too great a performance regression, a heuristic strategy might work where the rank data
|
||||||
|
// are loaded on first in-block advance, if said advance is > X docIDs. The hope being that a small first
|
||||||
|
// advance means that subsequent advances will be small too.
|
||||||
|
// Another alternative is to maintain an extra slice for DENSE rank, but IndexedDISI is already slice-heavy.
|
||||||
|
if (denseRankPower != -1) {
|
||||||
|
slice.readBytes(denseRankTable, 0, denseRankTable.length);
|
||||||
|
}
|
||||||
|
wordIndex = -1;
|
||||||
|
numberOfOnes = index + 1;
|
||||||
|
denseOrigoIndex = numberOfOnes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() throws IOException {
|
||||||
|
return advance(doc + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int index() {
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long cost() {
|
||||||
|
return cost;
|
||||||
|
}
|
||||||
|
|
||||||
|
enum Method {
|
||||||
|
SPARSE {
|
||||||
|
@Override
|
||||||
|
boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
|
||||||
|
final int targetInBlock = target & 0xFFFF;
|
||||||
|
// TODO: binary search
|
||||||
|
for (; disi.index < disi.nextBlockIndex;) {
|
||||||
|
int doc = Short.toUnsignedInt(disi.slice.readShort());
|
||||||
|
disi.index++;
|
||||||
|
if (doc >= targetInBlock) {
|
||||||
|
disi.doc = disi.block | doc;
|
||||||
|
disi.exists = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException {
|
||||||
|
final int targetInBlock = target & 0xFFFF;
|
||||||
|
// TODO: binary search
|
||||||
|
if (target == disi.doc) {
|
||||||
|
return disi.exists;
|
||||||
|
}
|
||||||
|
for (; disi.index < disi.nextBlockIndex;) {
|
||||||
|
int doc = Short.toUnsignedInt(disi.slice.readShort());
|
||||||
|
disi.index++;
|
||||||
|
if (doc >= targetInBlock) {
|
||||||
|
if (doc != targetInBlock) {
|
||||||
|
disi.index--;
|
||||||
|
disi.slice.seek(disi.slice.getFilePointer() - Short.BYTES);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
disi.exists = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
disi.exists = false;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
DENSE {
|
||||||
|
@Override
|
||||||
|
boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
|
||||||
|
final int targetInBlock = target & 0xFFFF;
|
||||||
|
final int targetWordIndex = targetInBlock >>> 6;
|
||||||
|
|
||||||
|
// If possible, skip ahead using the rank cache
|
||||||
|
rankSkip(disi, target);
|
||||||
|
|
||||||
|
for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
|
||||||
|
disi.word = disi.slice.readLong();
|
||||||
|
disi.numberOfOnes += Long.bitCount(disi.word);
|
||||||
|
}
|
||||||
|
disi.wordIndex = targetWordIndex;
|
||||||
|
|
||||||
|
long leftBits = disi.word >>> target;
|
||||||
|
if (leftBits != 0L) {
|
||||||
|
disi.doc = target + Long.numberOfTrailingZeros(leftBits);
|
||||||
|
disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// There were no set bits at the wanted position. Move forward until one is reached
|
||||||
|
while (++disi.wordIndex < 1024) {
|
||||||
|
// This could use the rank cache to skip empty spaces >= 512 bits, but it seems unrealistic
|
||||||
|
// that such blocks would be DENSE
|
||||||
|
disi.word = disi.slice.readLong();
|
||||||
|
if (disi.word != 0) {
|
||||||
|
disi.index = disi.numberOfOnes;
|
||||||
|
disi.numberOfOnes += Long.bitCount(disi.word);
|
||||||
|
disi.doc = disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// No set bits in the block at or after the wanted position.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException {
|
||||||
|
final int targetInBlock = target & 0xFFFF;
|
||||||
|
final int targetWordIndex = targetInBlock >>> 6;
|
||||||
|
|
||||||
|
rankSkip(disi, target);
|
||||||
|
|
||||||
|
for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
|
||||||
|
disi.word = disi.slice.readLong();
|
||||||
|
disi.numberOfOnes += Long.bitCount(disi.word);
|
||||||
|
}
|
||||||
|
disi.wordIndex = targetWordIndex;
|
||||||
|
|
||||||
|
long leftBits = disi.word >>> target;
|
||||||
|
disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
|
||||||
|
return (leftBits & 1L) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
},
|
||||||
|
ALL {
|
||||||
|
@Override
|
||||||
|
boolean advanceWithinBlock(IndexedDISI disi, int target) {
|
||||||
|
disi.doc = target;
|
||||||
|
disi.index = target - disi.gap;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
boolean advanceExactWithinBlock(IndexedDISI disi, int target) {
|
||||||
|
disi.index = target - disi.gap;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Advance to the first doc from the block that is equal to or greater than {@code target}.
|
||||||
|
* Return true if there is such a doc and false otherwise. */
|
||||||
|
abstract boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException;
|
||||||
|
|
||||||
|
/** Advance the iterator exactly to the position corresponding to the given {@code target}
|
||||||
|
* and return whether this document exists. */
|
||||||
|
abstract boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If the distance between the current position and the target is > 8 words, the rank cache will
|
||||||
|
* be used to guarantee a worst-case of 1 rank-lookup and 7 word-read-and-count-bits operations.
|
||||||
|
* Note: This does not guarantee a skip up to target, only up to nearest rank boundary. It is the
|
||||||
|
* responsibility of the caller to iterate further to reach target.
|
||||||
|
* @param disi standard DISI.
|
||||||
|
* @param target the wanted docID for which to calculate set-flag and index.
|
||||||
|
* @throws IOException if a DISI seek failed.
|
||||||
|
*/
|
||||||
|
private static void rankSkip(IndexedDISI disi, int target) throws IOException {
|
||||||
|
if (disi.denseRankPower == -1) { // No rank for the current structure
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
final int targetInBlock = target & 0xFFFF; // Lower 16 bits
|
||||||
|
final int targetWordIndex = targetInBlock >>> 6; // long: 2^6 = 64
|
||||||
|
|
||||||
|
// If the distance between the current position and the target is < rank-longs
|
||||||
|
// there is no sense in using rank
|
||||||
|
if (targetWordIndex - disi.wordIndex < (1 << (disi.denseRankPower - 6))) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve the rank as close to targetInBlock as possible (maximum distance is 8 longs)
|
||||||
|
// Note: rankOrigoOffset is tracked on block open, so it is absolute (e.g. don't add origo)
|
||||||
|
final int rankIndex = targetInBlock >> disi.denseRankPower; // Default is 9 (8 longs: 2^3 * 2^6 = 512 docIDs)
|
||||||
|
|
||||||
|
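// Each rank entry is an unsigned short (two bytes, big-endian) holding the number of set bits in the block before this rank-aligned point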
final int rank =
|
||||||
|
(disi.denseRankTable[rankIndex<<1] & 0xFF) << 8 |
|
||||||
|
(disi.denseRankTable[(rankIndex<<1)+1] & 0xFF);
|
||||||
|
|
||||||
|
// Position the counting logic just after the rank point
|
||||||
|
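// rankIndex << denseRankPower is the bit (docID) position of the rank point within the block; >> 6 converts that bit position to a long-word index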
final int rankAlignedWordIndex = rankIndex << disi.denseRankPower >> 6;
|
||||||
|
disi.slice.seek(disi.denseBitmapOffset + rankAlignedWordIndex*Long.BYTES);
|
||||||
|
long rankWord = disi.slice.readLong();
|
||||||
|
int denseNOO = rank + Long.bitCount(rankWord);
|
||||||
|
|
||||||
|
disi.wordIndex = rankAlignedWordIndex;
|
||||||
|
disi.word = rankWord;
|
||||||
|
disi.numberOfOnes = disi.denseOrigoIndex + denseNOO;
|
||||||
|
}
|
||||||
|
}
|
|
@@ -37,7 +37,6 @@ import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
 import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
 import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat;
 import org.apache.lucene.codecs.lucene60.Lucene60PointsFormat;
-import org.apache.lucene.codecs.lucene70.Lucene70NormsFormat;
 import org.apache.lucene.codecs.lucene70.Lucene70SegmentInfoFormat;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
@@ -150,7 +149,7 @@ public class Lucene80Codec extends Codec {
   /** Returns the docvalues format that should be used for writing
    * new segments of <code>field</code>.
    *
-   * The default implementation always returns "Lucene70".
+   * The default implementation always returns "Lucene80".
    * <p>
    * <b>WARNING:</b> if you subclass, you are responsible for index
    * backwards compatibility: future version of Lucene are only
@@ -166,9 +165,9 @@ public class Lucene80Codec extends Codec {
   }
 
   private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene50");
-  private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene70");
+  private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");
 
-  private final NormsFormat normsFormat = new Lucene70NormsFormat();
+  private final NormsFormat normsFormat = new Lucene80NormsFormat();
 
   @Override
   public final NormsFormat normsFormat() {
@@ -0,0 +1,677 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene80;
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.Closeable;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
|
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||||
|
import org.apache.lucene.codecs.DocValuesProducer;
|
||||||
|
import org.apache.lucene.index.BinaryDocValues;
|
||||||
|
import org.apache.lucene.index.DocValues;
|
||||||
|
import org.apache.lucene.index.EmptyDocValuesProducer;
|
||||||
|
import org.apache.lucene.index.FieldInfo;
|
||||||
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
import org.apache.lucene.index.SortedDocValues;
|
||||||
|
import org.apache.lucene.index.SortedNumericDocValues;
|
||||||
|
import org.apache.lucene.index.SortedSetDocValues;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
import org.apache.lucene.search.SortedSetSelector;
|
||||||
|
import org.apache.lucene.store.ByteBuffersDataOutput;
|
||||||
|
import org.apache.lucene.store.ByteBuffersIndexOutput;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.apache.lucene.util.MathUtil;
|
||||||
|
import org.apache.lucene.util.StringHelper;
|
||||||
|
import org.apache.lucene.util.packed.DirectMonotonicWriter;
|
||||||
|
import org.apache.lucene.util.packed.DirectWriter;
|
||||||
|
|
||||||
|
import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
||||||
|
import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SHIFT;
|
||||||
|
import static org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE;
|
||||||
|
|
||||||
|
/** writer for {@link Lucene80DocValuesFormat} */
|
||||||
|
final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable {
|
||||||
|
|
||||||
|
IndexOutput data, meta;
|
||||||
|
final int maxDoc;
|
||||||
|
|
||||||
|
/** expert: Creates a new writer */
|
||||||
|
public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
|
||||||
|
data = state.directory.createOutput(dataName, state.context);
|
||||||
|
CodecUtil.writeIndexHeader(data, dataCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||||
|
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
|
||||||
|
meta = state.directory.createOutput(metaName, state.context);
|
||||||
|
CodecUtil.writeIndexHeader(meta, metaCodec, Lucene80DocValuesFormat.VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||||
|
maxDoc = state.segmentInfo.maxDoc();
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
IOUtils.closeWhileHandlingException(this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
if (meta != null) {
|
||||||
|
meta.writeInt(-1); // write EOF marker
|
||||||
|
CodecUtil.writeFooter(meta); // write checksum
|
||||||
|
}
|
||||||
|
if (data != null) {
|
||||||
|
CodecUtil.writeFooter(data); // write checksum
|
||||||
|
}
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
if (success) {
|
||||||
|
IOUtils.close(data, meta);
|
||||||
|
} else {
|
||||||
|
IOUtils.closeWhileHandlingException(data, meta);
|
||||||
|
}
|
||||||
|
meta = data = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||||
|
meta.writeInt(field.number);
|
||||||
|
meta.writeByte(Lucene80DocValuesFormat.NUMERIC);
|
||||||
|
|
||||||
|
writeValues(field, new EmptyDocValuesProducer() {
|
||||||
|
@Override
|
||||||
|
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
|
||||||
|
return DocValues.singleton(valuesProducer.getNumeric(field));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class MinMaxTracker {
|
||||||
|
long min, max, numValues, spaceInBits;
|
||||||
|
|
||||||
|
MinMaxTracker() {
|
||||||
|
reset();
|
||||||
|
spaceInBits = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void reset() {
|
||||||
|
min = Long.MAX_VALUE;
|
||||||
|
max = Long.MIN_VALUE;
|
||||||
|
numValues = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Accumulate a new value. */
|
||||||
|
void update(long v) {
|
||||||
|
min = Math.min(min, v);
|
||||||
|
max = Math.max(max, v);
|
||||||
|
++numValues;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Update the required space. */
|
||||||
|
void finish() {
|
||||||
|
if (max > min) {
|
||||||
|
spaceInBits += DirectWriter.unsignedBitsRequired(max - min) * numValues;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Update space usage and get ready for accumulating values for the next block. */
|
||||||
|
void nextBlock() {
|
||||||
|
finish();
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||||
|
SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
|
||||||
|
int numDocsWithValue = 0;
|
||||||
|
MinMaxTracker minMax = new MinMaxTracker();
|
||||||
|
MinMaxTracker blockMinMax = new MinMaxTracker();
|
||||||
|
long gcd = 0;
|
||||||
|
Set<Long> uniqueValues = new HashSet<>();
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
for (int i = 0, count = values.docValueCount(); i < count; ++i) {
|
||||||
|
long v = values.nextValue();
|
||||||
|
|
||||||
|
if (gcd != 1) {
|
||||||
|
if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
|
||||||
|
// in that case v - minValue might overflow and make the GCD computation return
|
||||||
|
// wrong results. Since these extreme values are unlikely, we just discard
|
||||||
|
// GCD computation for them
|
||||||
|
gcd = 1;
|
||||||
|
} else if (minMax.numValues != 0) { // minValue needs to be set first
|
||||||
|
gcd = MathUtil.gcd(gcd, v - minMax.min);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
minMax.update(v);
|
||||||
|
blockMinMax.update(v);
|
||||||
|
if (blockMinMax.numValues == NUMERIC_BLOCK_SIZE) {
|
||||||
|
blockMinMax.nextBlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
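// Stop tracking unique values once more than 256 distinct values have been seen; table-compression then no longer applies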
if (uniqueValues != null
|
||||||
|
&& uniqueValues.add(v)
|
||||||
|
&& uniqueValues.size() > 256) {
|
||||||
|
uniqueValues = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
numDocsWithValue++;
|
||||||
|
}
|
||||||
|
|
||||||
|
minMax.finish();
|
||||||
|
blockMinMax.finish();
|
||||||
|
|
||||||
|
final long numValues = minMax.numValues;
|
||||||
|
long min = minMax.min;
|
||||||
|
final long max = minMax.max;
|
||||||
|
assert blockMinMax.spaceInBits <= minMax.spaceInBits;
|
||||||
|
|
||||||
|
if (numDocsWithValue == 0) { // meta[-2, 0]: No documents with values
|
||||||
|
meta.writeLong(-2); // docsWithFieldOffset
|
||||||
|
meta.writeLong(0L); // docsWithFieldLength
|
||||||
|
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||||
|
meta.writeByte((byte) -1); // denseRankPower
|
||||||
|
} else if (numDocsWithValue == maxDoc) { // meta[-1, 0]: All documents have values
|
||||||
|
meta.writeLong(-1); // docsWithFieldOffset
|
||||||
|
meta.writeLong(0L); // docsWithFieldLength
|
||||||
|
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||||
|
meta.writeByte((byte) -1); // denseRankPower
|
||||||
|
} else { // meta[data.offset, data.length]: IndexedDISI structure for documents with values
|
||||||
|
long offset = data.getFilePointer();
|
||||||
|
meta.writeLong(offset); // docsWithFieldOffset
|
||||||
|
values = valuesProducer.getSortedNumeric(field);
|
||||||
|
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||||
|
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
|
||||||
|
meta.writeShort(jumpTableEntryCount);
|
||||||
|
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||||
|
}
|
||||||
|
|
||||||
|
meta.writeLong(numValues);
|
||||||
|
final int numBitsPerValue;
|
||||||
|
boolean doBlocks = false;
|
||||||
|
Map<Long, Integer> encode = null;
|
||||||
|
if (min >= max) { // meta[-1]: All values are 0
|
||||||
|
numBitsPerValue = 0;
|
||||||
|
meta.writeInt(-1); // tablesize
|
||||||
|
} else {
|
||||||
|
if (uniqueValues != null
|
||||||
|
&& uniqueValues.size() > 1
|
||||||
|
&& DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1) < DirectWriter.unsignedBitsRequired((max - min) / gcd)) {
|
||||||
|
numBitsPerValue = DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1);
|
||||||
|
final Long[] sortedUniqueValues = uniqueValues.toArray(new Long[0]);
|
||||||
|
Arrays.sort(sortedUniqueValues);
|
||||||
|
meta.writeInt(sortedUniqueValues.length); // tablesize
|
||||||
|
for (Long v : sortedUniqueValues) {
|
||||||
|
meta.writeLong(v); // table[] entry
|
||||||
|
}
|
||||||
|
encode = new HashMap<>();
|
||||||
|
for (int i = 0; i < sortedUniqueValues.length; ++i) {
|
||||||
|
encode.put(sortedUniqueValues[i], i);
|
||||||
|
}
|
||||||
|
min = 0;
|
||||||
|
gcd = 1;
|
||||||
|
} else {
|
||||||
|
uniqueValues = null;
|
||||||
|
// we do blocks if that appears to save 10+% storage
|
||||||
|
doBlocks = minMax.spaceInBits > 0 && (double) blockMinMax.spaceInBits / minMax.spaceInBits <= 0.9;
|
||||||
|
if (doBlocks) {
|
||||||
|
numBitsPerValue = 0xFF;
|
||||||
|
meta.writeInt(-2 - NUMERIC_BLOCK_SHIFT); // tablesize
|
||||||
|
} else {
|
||||||
|
numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd);
|
||||||
|
if (gcd == 1 && min > 0
|
||||||
|
&& DirectWriter.unsignedBitsRequired(max) == DirectWriter.unsignedBitsRequired(max - min)) {
|
||||||
|
min = 0;
|
||||||
|
}
|
||||||
|
meta.writeInt(-1); // tablesize
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
meta.writeByte((byte) numBitsPerValue);
|
||||||
|
meta.writeLong(min);
|
||||||
|
meta.writeLong(gcd);
|
||||||
|
long startOffset = data.getFilePointer();
|
||||||
|
meta.writeLong(startOffset); // valueOffset
|
||||||
|
long jumpTableOffset = -1;
|
||||||
|
if (doBlocks) {
|
||||||
|
jumpTableOffset = writeValuesMultipleBlocks(valuesProducer.getSortedNumeric(field), gcd);
|
||||||
|
} else if (numBitsPerValue != 0) {
|
||||||
|
writeValuesSingleBlock(valuesProducer.getSortedNumeric(field), numValues, numBitsPerValue, min, gcd, encode);
|
||||||
|
}
|
||||||
|
meta.writeLong(data.getFilePointer() - startOffset); // valuesLength
|
||||||
|
meta.writeLong(jumpTableOffset);
|
||||||
|
return new long[] {numDocsWithValue, numValues};
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeValuesSingleBlock(SortedNumericDocValues values, long numValues, int numBitsPerValue,
|
||||||
|
long min, long gcd, Map<Long, Integer> encode) throws IOException {
|
||||||
|
DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue);
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
for (int i = 0, count = values.docValueCount(); i < count; ++i) {
|
||||||
|
long v = values.nextValue();
|
||||||
|
if (encode == null) {
|
||||||
|
writer.add((v - min) / gcd);
|
||||||
|
} else {
|
||||||
|
writer.add(encode.get(v));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
writer.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the offset to the jump-table for vBPV
|
||||||
|
private long writeValuesMultipleBlocks(SortedNumericDocValues values, long gcd) throws IOException {
|
||||||
|
long[] offsets = new long[ArrayUtil.oversize(1, Long.BYTES)];
|
||||||
|
int offsetsIndex = 0;
|
||||||
|
final long[] buffer = new long[NUMERIC_BLOCK_SIZE];
|
||||||
|
final ByteBuffersDataOutput encodeBuffer = ByteBuffersDataOutput.newResettableInstance();
|
||||||
|
int upTo = 0;
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
for (int i = 0, count = values.docValueCount(); i < count; ++i) {
|
||||||
|
buffer[upTo++] = values.nextValue();
|
||||||
|
if (upTo == NUMERIC_BLOCK_SIZE) {
|
||||||
|
offsets = ArrayUtil.grow(offsets, offsetsIndex+1);
|
||||||
|
offsets[offsetsIndex++] = data.getFilePointer();
|
||||||
|
writeBlock(buffer, NUMERIC_BLOCK_SIZE, gcd, encodeBuffer);
|
||||||
|
upTo = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (upTo > 0) {
|
||||||
|
offsets = ArrayUtil.grow(offsets, offsetsIndex+1);
|
||||||
|
offsets[offsetsIndex++] = data.getFilePointer();
|
||||||
|
writeBlock(buffer, upTo, gcd, encodeBuffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
// All blocks have been written. Flush the offset jump-table
|
||||||
|
final long offsetsOrigo = data.getFilePointer();
|
||||||
|
for (int i = 0 ; i < offsetsIndex ; i++) {
|
||||||
|
data.writeLong(offsets[i]);
|
||||||
|
}
|
||||||
|
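// Record where the jump-table starts; the same offset is returned to the caller, which stores it in the metadata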
data.writeLong(offsetsOrigo);
|
||||||
|
return offsetsOrigo;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeBlock(long[] values, int length, long gcd, ByteBuffersDataOutput buffer) throws IOException {
|
||||||
|
assert length > 0;
|
||||||
|
long min = values[0];
|
||||||
|
long max = values[0];
|
||||||
|
for (int i = 1; i < length; ++i) {
|
||||||
|
final long v = values[i];
|
||||||
|
assert Math.floorMod(values[i] - min, gcd) == 0;
|
||||||
|
min = Math.min(min, v);
|
||||||
|
max = Math.max(max, v);
|
||||||
|
}
|
||||||
|
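// A block in which all values are identical is encoded as bitsPerValue 0 followed by the single value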
if (min == max) {
|
||||||
|
data.writeByte((byte) 0);
|
||||||
|
data.writeLong(min);
|
||||||
|
} else {
|
||||||
|
final int bitsPerValue = DirectWriter.unsignedBitsRequired(max - min);
|
||||||
|
buffer.reset();
|
||||||
|
assert buffer.size() == 0;
|
||||||
|
final DirectWriter w = DirectWriter.getInstance(buffer, length, bitsPerValue);
|
||||||
|
for (int i = 0; i < length; ++i) {
|
||||||
|
w.add((values[i] - min) / gcd);
|
||||||
|
}
|
||||||
|
w.finish();
|
||||||
|
data.writeByte((byte) bitsPerValue);
|
||||||
|
data.writeLong(min);
|
||||||
|
data.writeInt(Math.toIntExact(buffer.size()));
|
||||||
|
buffer.copyTo(data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||||
|
meta.writeInt(field.number);
|
||||||
|
meta.writeByte(Lucene80DocValuesFormat.BINARY);
|
||||||
|
|
||||||
|
BinaryDocValues values = valuesProducer.getBinary(field);
|
||||||
|
long start = data.getFilePointer();
|
||||||
|
meta.writeLong(start); // dataOffset
|
||||||
|
int numDocsWithField = 0;
|
||||||
|
int minLength = Integer.MAX_VALUE;
|
||||||
|
int maxLength = 0;
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
numDocsWithField++;
|
||||||
|
BytesRef v = values.binaryValue();
|
||||||
|
int length = v.length;
|
||||||
|
data.writeBytes(v.bytes, v.offset, v.length);
|
||||||
|
minLength = Math.min(length, minLength);
|
||||||
|
maxLength = Math.max(length, maxLength);
|
||||||
|
}
|
||||||
|
assert numDocsWithField <= maxDoc;
|
||||||
|
meta.writeLong(data.getFilePointer() - start); // dataLength
|
||||||
|
|
||||||
|
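// Same docsWithField encoding as for numerics: -2 = no documents with a value, -1 = all documents, otherwise the offset of an IndexedDISI structure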
if (numDocsWithField == 0) {
|
||||||
|
meta.writeLong(-2); // docsWithFieldOffset
|
||||||
|
meta.writeLong(0L); // docsWithFieldLength
|
||||||
|
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||||
|
meta.writeByte((byte) -1); // denseRankPower
|
||||||
|
} else if (numDocsWithField == maxDoc) {
|
||||||
|
meta.writeLong(-1); // docsWithFieldOffset
|
||||||
|
meta.writeLong(0L); // docsWithFieldLength
|
||||||
|
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||||
|
meta.writeByte((byte) -1); // denseRankPower
|
||||||
|
} else {
|
||||||
|
long offset = data.getFilePointer();
|
||||||
|
meta.writeLong(offset); // docsWithFieldOffset
|
||||||
|
values = valuesProducer.getBinary(field);
|
||||||
|
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||||
|
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
|
||||||
|
meta.writeShort(jumpTableEntryCount);
|
||||||
|
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||||
|
}
|
||||||
|
|
||||||
|
meta.writeInt(numDocsWithField);
|
||||||
|
meta.writeInt(minLength);
|
||||||
|
meta.writeInt(maxLength);
|
||||||
|
if (maxLength > minLength) {
|
||||||
|
start = data.getFilePointer();
|
||||||
|
meta.writeLong(start);
|
||||||
|
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||||
|
|
||||||
|
final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||||
|
long addr = 0;
|
||||||
|
writer.add(addr);
|
||||||
|
values = valuesProducer.getBinary(field);
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
addr += values.binaryValue().length;
|
||||||
|
writer.add(addr);
|
||||||
|
}
|
||||||
|
writer.finish();
|
||||||
|
meta.writeLong(data.getFilePointer() - start);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||||
|
meta.writeInt(field.number);
|
||||||
|
meta.writeByte(Lucene80DocValuesFormat.SORTED);
|
||||||
|
doAddSortedField(field, valuesProducer);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||||
|
SortedDocValues values = valuesProducer.getSorted(field);
|
||||||
|
int numDocsWithField = 0;
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
numDocsWithField++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numDocsWithField == 0) {
|
||||||
|
meta.writeLong(-2); // docsWithFieldOffset
|
||||||
|
meta.writeLong(0L); // docsWithFieldLength
|
||||||
|
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||||
|
meta.writeByte((byte) -1); // denseRankPower
|
||||||
|
} else if (numDocsWithField == maxDoc) {
|
||||||
|
meta.writeLong(-1); // docsWithFieldOffset
|
||||||
|
meta.writeLong(0L); // docsWithFieldLength
|
||||||
|
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||||
|
meta.writeByte((byte) -1); // denseRankPower
|
||||||
|
} else {
|
||||||
|
long offset = data.getFilePointer();
|
||||||
|
meta.writeLong(offset); // docsWithFieldOffset
|
||||||
|
values = valuesProducer.getSorted(field);
|
||||||
|
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||||
|
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
|
||||||
|
meta.writeShort(jumpTableEntryCount);
|
||||||
|
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||||
|
}
|
||||||
|
|
||||||
|
meta.writeInt(numDocsWithField);
|
||||||
|
if (values.getValueCount() <= 1) {
|
||||||
|
meta.writeByte((byte) 0); // bitsPerValue
|
||||||
|
meta.writeLong(0L); // ordsOffset
|
||||||
|
meta.writeLong(0L); // ordsLength
|
||||||
|
} else {
|
||||||
|
int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
|
||||||
|
meta.writeByte((byte) numberOfBitsPerOrd); // bitsPerValue
|
||||||
|
long start = data.getFilePointer();
|
||||||
|
meta.writeLong(start); // ordsOffset
|
||||||
|
DirectWriter writer = DirectWriter.getInstance(data, numDocsWithField, numberOfBitsPerOrd);
|
||||||
|
values = valuesProducer.getSorted(field);
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
writer.add(values.ordValue());
|
||||||
|
}
|
||||||
|
writer.finish();
|
||||||
|
meta.writeLong(data.getFilePointer() - start); // ordsLength
|
||||||
|
}
|
||||||
|
|
||||||
|
addTermsDict(DocValues.singleton(valuesProducer.getSorted(field)));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addTermsDict(SortedSetDocValues values) throws IOException {
|
||||||
|
final long size = values.getValueCount();
|
||||||
|
meta.writeVLong(size);
|
||||||
|
meta.writeInt(Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT);
|
||||||
|
|
||||||
|
ByteBuffersDataOutput addressBuffer = new ByteBuffersDataOutput();
|
||||||
|
ByteBuffersIndexOutput addressOutput = new ByteBuffersIndexOutput(addressBuffer, "temp", "temp");
|
||||||
|
meta.writeInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||||
|
long numBlocks = (size + Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK) >>> Lucene80DocValuesFormat.TERMS_DICT_BLOCK_SHIFT;
|
||||||
|
DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, addressOutput, numBlocks, DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||||
|
|
||||||
|
BytesRefBuilder previous = new BytesRefBuilder();
|
||||||
|
long ord = 0;
|
||||||
|
long start = data.getFilePointer();
|
||||||
|
int maxLength = 0;
|
||||||
|
TermsEnum iterator = values.termsEnum();
|
||||||
|
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
|
||||||
|
if ((ord & Lucene80DocValuesFormat.TERMS_DICT_BLOCK_MASK) == 0) {
|
||||||
|
writer.add(data.getFilePointer() - start);
|
||||||
|
data.writeVInt(term.length);
|
||||||
|
data.writeBytes(term.bytes, term.offset, term.length);
|
||||||
|
} else {
|
||||||
|
final int prefixLength = StringHelper.bytesDifference(previous.get(), term);
|
||||||
|
final int suffixLength = term.length - prefixLength;
|
||||||
|
assert suffixLength > 0; // terms are unique
|
||||||
|
|
||||||
|
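// Pack the prefix and suffix lengths into one byte: low nibble = prefix length (capped at 15), high nibble = suffix length - 1 (capped at 15); any overflow is written as VInts below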
data.writeByte((byte) (Math.min(prefixLength, 15) | (Math.min(15, suffixLength - 1) << 4)));
|
||||||
|
if (prefixLength >= 15) {
|
||||||
|
data.writeVInt(prefixLength - 15);
|
||||||
|
}
|
||||||
|
if (suffixLength >= 16) {
|
||||||
|
data.writeVInt(suffixLength - 16);
|
||||||
|
}
|
||||||
|
data.writeBytes(term.bytes, term.offset + prefixLength, term.length - prefixLength);
|
||||||
|
}
|
||||||
|
maxLength = Math.max(maxLength, term.length);
|
||||||
|
previous.copyBytes(term);
|
||||||
|
++ord;
|
||||||
|
}
|
||||||
|
writer.finish();
|
||||||
|
meta.writeInt(maxLength);
|
||||||
|
meta.writeLong(start);
|
||||||
|
meta.writeLong(data.getFilePointer() - start);
|
||||||
|
start = data.getFilePointer();
|
||||||
|
addressBuffer.copyTo(data);
|
||||||
|
meta.writeLong(start);
|
||||||
|
meta.writeLong(data.getFilePointer() - start);
|
||||||
|
|
||||||
|
// Now write the reverse terms index
|
||||||
|
writeTermsIndex(values);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeTermsIndex(SortedSetDocValues values) throws IOException {
|
||||||
|
final long size = values.getValueCount();
|
||||||
|
meta.writeInt(Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT);
|
||||||
|
long start = data.getFilePointer();
|
||||||
|
|
||||||
|
long numBlocks = 1L + ((size + Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) >>> Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_SHIFT);
|
||||||
|
ByteBuffersDataOutput addressBuffer = new ByteBuffersDataOutput();
|
||||||
|
DirectMonotonicWriter writer;
|
||||||
|
try (ByteBuffersIndexOutput addressOutput = new ByteBuffersIndexOutput(addressBuffer, "temp", "temp")) {
|
||||||
|
writer = DirectMonotonicWriter.getInstance(meta, addressOutput, numBlocks, DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||||
|
TermsEnum iterator = values.termsEnum();
|
||||||
|
BytesRefBuilder previous = new BytesRefBuilder();
|
||||||
|
long offset = 0;
|
||||||
|
long ord = 0;
|
||||||
|
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
|
||||||
|
if ((ord & Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) == 0) {
|
||||||
|
writer.add(offset);
|
||||||
|
final int sortKeyLength;
|
||||||
|
if (ord == 0) {
|
||||||
|
// no previous term: no bytes to write
|
||||||
|
sortKeyLength = 0;
|
||||||
|
} else {
|
||||||
|
sortKeyLength = StringHelper.sortKeyLength(previous.get(), term);
|
||||||
|
}
|
||||||
|
offset += sortKeyLength;
|
||||||
|
data.writeBytes(term.bytes, term.offset, sortKeyLength);
|
||||||
|
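// Remember the last term of each reverse-index block so that the next block start only needs to store the sort-key prefix that distinguishes it from this term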
} else if ((ord & Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) == Lucene80DocValuesFormat.TERMS_DICT_REVERSE_INDEX_MASK) {
|
||||||
|
previous.copyBytes(term);
|
||||||
|
}
|
||||||
|
++ord;
|
||||||
|
}
|
||||||
|
writer.add(offset);
|
||||||
|
writer.finish();
|
||||||
|
meta.writeLong(start);
|
||||||
|
meta.writeLong(data.getFilePointer() - start);
|
||||||
|
start = data.getFilePointer();
|
||||||
|
addressBuffer.copyTo(data);
|
||||||
|
meta.writeLong(start);
|
||||||
|
meta.writeLong(data.getFilePointer() - start);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||||
|
meta.writeInt(field.number);
|
||||||
|
meta.writeByte(Lucene80DocValuesFormat.SORTED_NUMERIC);
|
||||||
|
|
||||||
|
long[] stats = writeValues(field, valuesProducer);
|
||||||
|
int numDocsWithField = Math.toIntExact(stats[0]);
|
||||||
|
long numValues = stats[1];
|
||||||
|
assert numValues >= numDocsWithField;
|
||||||
|
|
||||||
|
meta.writeInt(numDocsWithField);
|
||||||
|
if (numValues > numDocsWithField) {
|
||||||
|
long start = data.getFilePointer();
|
||||||
|
meta.writeLong(start);
|
||||||
|
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||||
|
|
||||||
|
final DirectMonotonicWriter addressesWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1L, DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||||
|
long addr = 0;
|
||||||
|
addressesWriter.add(addr);
|
||||||
|
SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
addr += values.docValueCount();
|
||||||
|
addressesWriter.add(addr);
|
||||||
|
}
|
||||||
|
addressesWriter.finish();
|
||||||
|
meta.writeLong(data.getFilePointer() - start);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
|
||||||
|
meta.writeInt(field.number);
|
||||||
|
meta.writeByte(Lucene80DocValuesFormat.SORTED_SET);
|
||||||
|
|
||||||
|
SortedSetDocValues values = valuesProducer.getSortedSet(field);
|
||||||
|
int numDocsWithField = 0;
|
||||||
|
long numOrds = 0;
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
numDocsWithField++;
|
||||||
|
for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
|
||||||
|
numOrds++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numDocsWithField == numOrds) {
|
||||||
|
meta.writeByte((byte) 0); // multiValued (0 = singleValued)
|
||||||
|
doAddSortedField(field, new EmptyDocValuesProducer() {
|
||||||
|
@Override
|
||||||
|
public SortedDocValues getSorted(FieldInfo field) throws IOException {
|
||||||
|
return SortedSetSelector.wrap(valuesProducer.getSortedSet(field), SortedSetSelector.Type.MIN);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
meta.writeByte((byte) 1); // multiValued (1 = multiValued)
|
||||||
|
|
||||||
|
assert numDocsWithField != 0;
|
||||||
|
if (numDocsWithField == maxDoc) {
|
||||||
|
meta.writeLong(-1); // docsWithFieldOffset
|
||||||
|
meta.writeLong(0L); // docsWithFieldLength
|
||||||
|
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||||
|
meta.writeByte((byte) -1); // denseRankPower
|
||||||
|
} else {
|
||||||
|
long offset = data.getFilePointer();
|
||||||
|
meta.writeLong(offset); // docsWithFieldOffset
|
||||||
|
values = valuesProducer.getSortedSet(field);
|
||||||
|
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||||
|
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
|
||||||
|
meta.writeShort(jumpTableEntryCount);
|
||||||
|
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||||
|
}
|
||||||
|
|
||||||
|
int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
|
||||||
|
meta.writeByte((byte) numberOfBitsPerOrd); // bitsPerValue
|
||||||
|
long start = data.getFilePointer();
|
||||||
|
meta.writeLong(start); // ordsOffset
|
||||||
|
DirectWriter writer = DirectWriter.getInstance(data, numOrds, numberOfBitsPerOrd);
|
||||||
|
values = valuesProducer.getSortedSet(field);
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
|
||||||
|
writer.add(ord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
writer.finish();
|
||||||
|
meta.writeLong(data.getFilePointer() - start); // ordsLength
|
||||||
|
|
||||||
|
meta.writeInt(numDocsWithField);
|
||||||
|
start = data.getFilePointer();
|
||||||
|
meta.writeLong(start); // addressesOffset
|
||||||
|
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||||
|
|
||||||
|
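// Addresses are the running totals of ordinals per document, so a document's ordinals occupy the range [addresses[doc], addresses[doc+1]) in the ordinal list written above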
final DirectMonotonicWriter addressesWriter = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
|
||||||
|
long addr = 0;
|
||||||
|
addressesWriter.add(addr);
|
||||||
|
values = valuesProducer.getSortedSet(field);
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
values.nextOrd();
|
||||||
|
addr++;
|
||||||
|
while (values.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) {
|
||||||
|
addr++;
|
||||||
|
}
|
||||||
|
addressesWriter.add(addr);
|
||||||
|
}
|
||||||
|
addressesWriter.finish();
|
||||||
|
meta.writeLong(data.getFilePointer() - start); // addressesLength
|
||||||
|
|
||||||
|
addTermsDict(values);
|
||||||
|
}
|
||||||
|
}
|
|
@@ -0,0 +1,175 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene80;
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||||
|
import org.apache.lucene.codecs.DocValuesFormat;
|
||||||
|
import org.apache.lucene.codecs.DocValuesProducer;
|
||||||
|
import org.apache.lucene.index.DocValuesType;
|
||||||
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
|
import org.apache.lucene.index.SegmentReadState;
|
||||||
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
import org.apache.lucene.store.DataOutput;
|
||||||
|
import org.apache.lucene.util.SmallFloat;
|
||||||
|
import org.apache.lucene.util.packed.DirectWriter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lucene 8.0 DocValues format.
|
||||||
|
* <p>
|
||||||
|
* Documents that have a value for the field are encoded in a way that it is always possible to
|
||||||
|
* know the ordinal of the current document in the set of documents that have a value. For instance,
|
||||||
|
* say the set of documents that have a value for the field is <tt>{1, 5, 6, 11}</tt>. When the
|
||||||
|
* iterator is on <tt>6</tt>, it knows that this is the 3rd item of the set. This way, values can
|
||||||
|
* be stored densely and accessed based on their index at search time. If all documents in a segment
|
||||||
|
* have a value for the field, the index is the same as the doc ID, so this case is encoded implicitly
|
||||||
|
* and is very fast at query time. On the other hand if some documents are missing a value for the
|
||||||
|
* field then the set of documents that have a value is encoded into blocks. All doc IDs that share
|
||||||
|
* the same upper 16 bits are encoded into the same block with the following strategies:
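* (That is, block = docID >>> 16 and the position within the block = docID & 0xFFFF, so a block spans 65536 doc IDs.)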
|
||||||
|
* <ul>
|
||||||
|
* <li>SPARSE: This strategy is used when a block contains at most 4095 documents. The lower 16
|
||||||
|
* bits of doc IDs are stored as {@link DataOutput#writeShort(short) shorts} while the upper
|
||||||
|
* 16 bits are given by the block ID.
|
||||||
|
* <li>DENSE: This strategy is used when a block contains between 4096 and 65535 documents. The
|
||||||
|
* lower bits of doc IDs are stored in a bit set. Advancing < 512 documents is performed using
|
||||||
|
* {@link Long#numberOfTrailingZeros(long) ntz} operations while the index is computed by
|
||||||
|
* accumulating the {@link Long#bitCount(long) bit counts} of the visited longs.
|
||||||
|
* Advancing >= 512 documents is performed by skipping to the start of the needed 512 document
|
||||||
|
* sub-block and iterating to the specific document within that block. The index for the
|
||||||
|
* sub-block that is skipped to is retrieved from a rank-table positioned before the bit set.
|
||||||
|
* The rank-table holds the origo index for each 512-document sub-block, represented as an
* unsigned short for each of the 128 sub-blocks in the block.
|
||||||
|
* <li>ALL: This strategy is used when a block contains exactly 65536 documents, meaning that
|
||||||
|
* the block is full. In that case doc IDs do not need to be stored explicitly. This is
|
||||||
|
* typically faster than both SPARSE and DENSE which is a reason why it is preferable to have
|
||||||
|
* all documents that have a value for a field using contiguous doc IDs, for instance by
|
||||||
|
* using {@link IndexWriterConfig#setIndexSort(org.apache.lucene.search.Sort) index sorting}.
|
||||||
|
* </ul>
|
||||||
|
* <p>
|
||||||
|
* Skipping blocks to arrive at a wanted document is either done on an iterative basis or by using the
|
||||||
|
* jump-table stored at the end of the chain of blocks. The jump-table holds the offset as well as the
|
||||||
|
* index for all blocks, packed in a single long per block.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* Then the five per-document value types (Numeric, Binary, Sorted, SortedSet, SortedNumeric) are
|
||||||
|
* encoded using the following strategies:
|
||||||
|
* <p>
|
||||||
|
* {@link DocValuesType#NUMERIC NUMERIC}:
|
||||||
|
* <ul>
|
||||||
|
* <li>Delta-compressed: per-document integers written as deltas from the minimum value,
|
||||||
|
* compressed with bitpacking. For more information, see {@link DirectWriter}.
|
||||||
|
* <li>Table-compressed: when the number of unique values is very small (< 256), and
|
||||||
|
* when there are unused "gaps" in the range of values used (such as {@link SmallFloat}),
|
||||||
|
* a lookup table is written instead. Each per-document entry is instead the ordinal
|
||||||
|
* to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}).
|
||||||
|
* <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest
|
||||||
|
* common divisor (GCD) is computed, and quotients are stored using Delta-compressed Numerics.
|
||||||
|
* <li>Monotonic-compressed: when all numbers are monotonically increasing offsets, they are written
|
||||||
|
* as blocks of bitpacked integers, encoding the deviation from the expected delta.
|
||||||
|
* <li>Const-compressed: when there is only one possible value, no per-document data is needed and
|
||||||
|
* this value is encoded alone.
|
||||||
|
* </ul>
|
||||||
|
* <p>
|
||||||
|
* Depending on calculated gains, the numbers might be split into blocks of 16384 values. In that case,
|
||||||
|
* a jump-table with block offsets is appended to the blocks for O(1) access to the needed block.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* {@link DocValuesType#BINARY BINARY}:
|
||||||
|
* <ul>
|
||||||
|
* <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length.
|
||||||
|
* Each document's value can be addressed directly with multiplication ({@code docID * length}).
|
||||||
|
* <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses
|
||||||
|
* for each document. The addresses are written as Monotonic-compressed numerics.
|
||||||
|
* <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written
|
||||||
|
* completely and other values sharing prefixes. chunk addresses are written as Monotonic-compressed
|
||||||
|
* numerics. A reverse lookup index is written from a portion of every 1024th term.
|
||||||
|
* </ul>
|
||||||
|
* <p>
|
||||||
|
* {@link DocValuesType#SORTED SORTED}:
|
||||||
|
* <ul>
|
||||||
|
* <li>Sorted: a mapping of ordinals to deduplicated terms is written as Prefix-compressed Binary,
|
||||||
|
* along with the per-document ordinals written using one of the numeric strategies above.
|
||||||
|
* </ul>
|
||||||
|
* <p>
|
||||||
|
* {@link DocValuesType#SORTED_SET SORTED_SET}:
|
||||||
|
* <ul>
|
||||||
|
* <li>Single: if all documents have 0 or 1 value, then data are written like SORTED.
|
||||||
|
* <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
|
||||||
|
* an ordinal list and per-document index into this list are written using the numeric strategies
|
||||||
|
* above.
|
||||||
|
* </ul>
|
||||||
|
* <p>
|
||||||
|
* {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
|
||||||
|
* <ul>
|
||||||
|
* <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
|
||||||
|
* <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
|
||||||
|
* strategies above.
|
||||||
|
* </ul>
|
||||||
|
* <p>
|
||||||
|
* Files:
|
||||||
|
* <ol>
|
||||||
|
* <li><tt>.dvd</tt>: DocValues data</li>
|
||||||
|
* <li><tt>.dvm</tt>: DocValues metadata</li>
|
||||||
|
* </ol>
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public final class Lucene80DocValuesFormat extends DocValuesFormat {
|
||||||
|
|
||||||
|
/** Sole Constructor */
|
||||||
|
public Lucene80DocValuesFormat() {
|
||||||
|
super("Lucene80");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||||
|
return new Lucene80DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||||
|
return new Lucene80DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
|
||||||
|
}
|
||||||
|
|
||||||
|
static final String DATA_CODEC = "Lucene80DocValuesData";
|
||||||
|
static final String DATA_EXTENSION = "dvd";
|
||||||
|
static final String META_CODEC = "Lucene80DocValuesMetadata";
|
||||||
|
static final String META_EXTENSION = "dvm";
|
||||||
|
static final int VERSION_START = 0;
|
||||||
|
static final int VERSION_CURRENT = VERSION_START;
|
||||||
|
|
||||||
|
// indicates docvalues type
|
||||||
|
static final byte NUMERIC = 0;
|
||||||
|
static final byte BINARY = 1;
|
||||||
|
static final byte SORTED = 2;
|
||||||
|
static final byte SORTED_SET = 3;
|
||||||
|
static final byte SORTED_NUMERIC = 4;
|
||||||
|
|
||||||
|
static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16;
|
||||||
|
|
||||||
|
static final int NUMERIC_BLOCK_SHIFT = 14;
|
||||||
|
static final int NUMERIC_BLOCK_SIZE = 1 << NUMERIC_BLOCK_SHIFT;
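// NUMERIC_BLOCK_SIZE is 16384: the number of values per block when numerics are split into blocks (see the class javadoc)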
|
||||||
|
|
||||||
|
static final int TERMS_DICT_BLOCK_SHIFT = 4;
|
||||||
|
static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT;
|
||||||
|
static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;
|
||||||
|
|
||||||
|
static final int TERMS_DICT_REVERSE_INDEX_SHIFT = 10;
|
||||||
|
static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT;
|
||||||
|
static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1;
|
||||||
|
}
|
File diff suppressed because it is too large
|
@@ -0,0 +1,165 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene80;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
|
import org.apache.lucene.codecs.NormsConsumer;
|
||||||
|
import org.apache.lucene.codecs.NormsProducer;
|
||||||
|
import org.apache.lucene.index.FieldInfo;
|
||||||
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
|
import org.apache.lucene.index.NumericDocValues;
|
||||||
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
import static org.apache.lucene.codecs.lucene80.Lucene80NormsFormat.VERSION_CURRENT;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writer for {@link Lucene80NormsFormat}
|
||||||
|
*/
|
||||||
|
final class Lucene80NormsConsumer extends NormsConsumer {
|
||||||
|
IndexOutput data, meta;
|
||||||
|
final int maxDoc;
|
||||||
|
|
||||||
|
Lucene80NormsConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
|
||||||
|
data = state.directory.createOutput(dataName, state.context);
|
||||||
|
CodecUtil.writeIndexHeader(data, dataCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||||
|
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
|
||||||
|
meta = state.directory.createOutput(metaName, state.context);
|
||||||
|
CodecUtil.writeIndexHeader(meta, metaCodec, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||||
|
maxDoc = state.segmentInfo.maxDoc();
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
IOUtils.closeWhileHandlingException(this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
if (meta != null) {
|
||||||
|
meta.writeInt(-1); // write EOF marker
|
||||||
|
CodecUtil.writeFooter(meta); // write checksum
|
||||||
|
}
|
||||||
|
if (data != null) {
|
||||||
|
CodecUtil.writeFooter(data); // write checksum
|
||||||
|
}
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
if (success) {
|
||||||
|
IOUtils.close(data, meta);
|
||||||
|
} else {
|
||||||
|
IOUtils.closeWhileHandlingException(data, meta);
|
||||||
|
}
|
||||||
|
meta = data = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void addNormsField(FieldInfo field, NormsProducer normsProducer) throws IOException {
|
||||||
|
NumericDocValues values = normsProducer.getNorms(field);
|
||||||
|
int numDocsWithValue = 0;
|
||||||
|
long min = Long.MAX_VALUE;
|
||||||
|
long max = Long.MIN_VALUE;
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
numDocsWithValue++;
|
||||||
|
long v = values.longValue();
|
||||||
|
min = Math.min(min, v);
|
||||||
|
max = Math.max(max, v);
|
||||||
|
}
|
||||||
|
assert numDocsWithValue <= maxDoc;
|
||||||
|
|
||||||
|
meta.writeInt(field.number);
|
||||||
|
|
||||||
|
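// docsWithFieldOffset: -2 = no documents have a norm, -1 = all documents have a norm, otherwise the offset of the IndexedDISI structure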
if (numDocsWithValue == 0) {
|
||||||
|
meta.writeLong(-2); // docsWithFieldOffset
|
||||||
|
meta.writeLong(0L); // docsWithFieldLength
|
||||||
|
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||||
|
meta.writeByte((byte) -1); // denseRankPower
|
||||||
|
} else if (numDocsWithValue == maxDoc) {
|
||||||
|
meta.writeLong(-1); // docsWithFieldOffset
|
||||||
|
meta.writeLong(0L); // docsWithFieldLength
|
||||||
|
meta.writeShort((short) -1); // jumpTableEntryCount
|
||||||
|
meta.writeByte((byte) -1); // denseRankPower
|
||||||
|
} else {
|
||||||
|
long offset = data.getFilePointer();
|
||||||
|
meta.writeLong(offset); // docsWithFieldOffset
|
||||||
|
values = normsProducer.getNorms(field);
|
||||||
|
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||||
|
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
|
||||||
|
meta.writeShort(jumpTableEntryCount);
|
||||||
|
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
|
||||||
|
}
|
||||||
|
|
||||||
|
meta.writeInt(numDocsWithValue);
|
||||||
|
int numBytesPerValue = numBytesPerValue(min, max);
|
||||||
|
|
||||||
|
meta.writeByte((byte) numBytesPerValue);
|
||||||
|
if (numBytesPerValue == 0) {
|
||||||
|
meta.writeLong(min);
|
||||||
|
} else {
|
||||||
|
meta.writeLong(data.getFilePointer()); // normsOffset
|
||||||
|
values = normsProducer.getNorms(field);
|
||||||
|
writeValues(values, numBytesPerValue, data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int numBytesPerValue(long min, long max) {
|
||||||
|
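// min >= max means there are no values or all norms are identical; zero bytes per value are used and the single value is stored in the metadata instead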
if (min >= max) {
|
||||||
|
return 0;
|
||||||
|
} else if (min >= Byte.MIN_VALUE && max <= Byte.MAX_VALUE) {
|
||||||
|
return 1;
|
||||||
|
} else if (min >= Short.MIN_VALUE && max <= Short.MAX_VALUE) {
|
||||||
|
return 2;
|
||||||
|
} else if (min >= Integer.MIN_VALUE && max <= Integer.MAX_VALUE) {
|
||||||
|
return 4;
|
||||||
|
} else {
|
||||||
|
return 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void writeValues(NumericDocValues values, int numBytesPerValue, IndexOutput out) throws IOException, AssertionError {
|
||||||
|
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
|
||||||
|
long value = values.longValue();
|
||||||
|
switch (numBytesPerValue) {
|
||||||
|
case 1:
|
||||||
|
out.writeByte((byte) value);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
out.writeShort((short) value);
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
out.writeInt((int) value);
|
||||||
|
break;
|
||||||
|
case 8:
|
||||||
|
out.writeLong(value);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new AssertionError();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@@ -0,0 +1,99 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene80;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
|
import org.apache.lucene.codecs.NormsConsumer;
|
||||||
|
import org.apache.lucene.codecs.NormsFormat;
|
||||||
|
import org.apache.lucene.codecs.NormsProducer;
|
||||||
|
import org.apache.lucene.index.SegmentReadState;
|
||||||
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
import org.apache.lucene.store.DataOutput;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lucene 8.0 Score normalization format.
|
||||||
|
* <p>
|
||||||
|
* Encodes normalization values by encoding each value with the minimum
|
||||||
|
* number of bytes needed to represent the range (which can be zero).
|
||||||
|
* <p>
|
||||||
|
* Files:
|
||||||
|
* <ol>
|
||||||
|
* <li><tt>.nvd</tt>: Norms data</li>
|
||||||
|
* <li><tt>.nvm</tt>: Norms metadata</li>
|
||||||
|
* </ol>
|
||||||
|
* <ol>
|
||||||
|
* <li><a name="nvm"></a>
|
||||||
|
* <p>The Norms metadata or .nvm file.</p>
|
||||||
|
* <p>For each norms field, this stores metadata, such as the offset into the
|
||||||
|
* Norms data (.nvd)</p>
|
||||||
|
* <p>Norms metadata (.nvm) --> Header,<Entry><sup>NumFields</sup>,Footer</p>
|
||||||
|
* <ul>
|
||||||
|
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
|
||||||
|
* <li>Entry --> FieldNumber, DocsWithFieldAddress, DocsWithFieldLength, NumDocsWithField, BytesPerNorm, NormsAddress</li>
|
||||||
|
* <li>FieldNumber --> {@link DataOutput#writeInt Int32}</li>
|
||||||
|
* <li>DocsWithFieldAddress --> {@link DataOutput#writeLong Int64}</li>
|
||||||
|
* <li>DocsWithFieldLength --> {@link DataOutput#writeLong Int64}</li>
|
||||||
|
* <li>NumDocsWithField --> {@link DataOutput#writeInt Int32}</li>
|
||||||
|
* <li>BytesPerNorm --> {@link DataOutput#writeByte byte}</li>
|
||||||
|
* <li>NormsAddress --> {@link DataOutput#writeLong Int64}</li>
|
||||||
|
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||||
|
* </ul>
|
||||||
|
* <p>FieldNumber of -1 indicates the end of metadata.</p>
|
||||||
|
* <p>NormsAddress is the pointer to the start of the data in the norms data (.nvd), or the singleton value
|
||||||
|
* when BytesPerValue = 0. If BytesPerValue is different from 0 then there are NumDocsWithField values
|
||||||
|
* to read at that offset.</p>
|
||||||
|
* <p>DocsWithFieldAddress is the pointer to the start of the bit set containing documents that have a norm
|
||||||
|
* in the norms data (.nvd), or -2 if no documents have a norm value, or -1 if all documents have a norm
|
||||||
|
* value.</p>
|
||||||
|
* <p>DocsWithFieldLength is the number of bytes used to encode the set of documents that have a norm.</p>
|
||||||
|
* <li><a name="nvd"></a>
|
||||||
|
* <p>The Norms data or .nvd file.</p>
|
||||||
|
* <p>For each Norms field, this stores the actual per-document data (the heavy-lifting)</p>
|
||||||
|
* <p>Norms data (.nvd) --> Header,< Data ><sup>NumFields</sup>,Footer</p>
|
||||||
|
* <ul>
|
||||||
|
* <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li>
|
||||||
|
* <li>DocsWithFieldData --> {@link IndexedDISI#writeBitSet Bit set of MaxDoc bits}</li>
|
||||||
|
* <li>NormsData --> {@link DataOutput#writeByte(byte) byte}<sup>NumDocsWithField * BytesPerValue</sup></li>
|
||||||
|
* <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
|
||||||
|
* </ul>
|
||||||
|
* </ol>
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public class Lucene80NormsFormat extends NormsFormat {
|
||||||
|
|
||||||
|
/** Sole Constructor */
|
||||||
|
public Lucene80NormsFormat() {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NormsConsumer normsConsumer(SegmentWriteState state) throws IOException {
|
||||||
|
return new Lucene80NormsConsumer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NormsProducer normsProducer(SegmentReadState state) throws IOException {
|
||||||
|
return new Lucene80NormsProducer(state, DATA_CODEC, DATA_EXTENSION, METADATA_CODEC, METADATA_EXTENSION);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final String DATA_CODEC = "Lucene80NormsData";
|
||||||
|
private static final String DATA_EXTENSION = "nvd";
|
||||||
|
private static final String METADATA_CODEC = "Lucene80NormsMetadata";
|
||||||
|
private static final String METADATA_EXTENSION = "nvm";
|
||||||
|
static final int VERSION_START = 0;
|
||||||
|
static final int VERSION_CURRENT = VERSION_START;
|
||||||
|
}
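
Illustration only, not part of the commit: a minimal sketch of decoding one .nvm entry as laid out in the Javadoc above. It mirrors the reads performed by Lucene80NormsProducer#readFields further down in this diff; the class name and comments are invented for the example.

import java.io.IOException;
import org.apache.lucene.store.IndexInput;

final class NormsMetadataSketch {
  long docsWithFieldOffset, docsWithFieldLength, normsOffset;
  short jumpTableEntryCount;
  byte denseRankPower, bytesPerNorm;
  int numDocsWithField;

  /** Reads entries until the FieldNumber sentinel -1 is reached. */
  static void readAll(IndexInput meta) throws IOException {
    for (int field = meta.readInt(); field != -1; field = meta.readInt()) {
      NormsMetadataSketch e = new NormsMetadataSketch();
      e.docsWithFieldOffset = meta.readLong();  // DocsWithFieldAddress: -2 = no docs, -1 = all docs, else pointer into .nvd
      e.docsWithFieldLength = meta.readLong();  // bytes used by the IndexedDISI bit set
      e.jumpTableEntryCount = meta.readShort(); // number of jump-table entries enabling O(1) block skips
      e.denseRankPower = meta.readByte();       // rank granularity inside DENSE blocks, or -1 if disabled
      e.numDocsWithField = meta.readInt();
      e.bytesPerNorm = meta.readByte();         // 0, 1, 2, 4 or 8
      e.normsOffset = meta.readLong();          // NormsAddress, or the constant norm value when bytesPerNorm == 0
    }
  }
}
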
@ -0,0 +1,386 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene80;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
|
import org.apache.lucene.codecs.NormsProducer;
|
||||||
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
|
import org.apache.lucene.index.DocValues;
|
||||||
|
import org.apache.lucene.index.FieldInfo;
|
||||||
|
import org.apache.lucene.index.FieldInfos;
|
||||||
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
|
import org.apache.lucene.index.NumericDocValues;
|
||||||
|
import org.apache.lucene.index.SegmentReadState;
|
||||||
|
import org.apache.lucene.store.ChecksumIndexInput;
|
||||||
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.store.RandomAccessInput;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
import static org.apache.lucene.codecs.lucene80.Lucene80NormsFormat.VERSION_CURRENT;
|
||||||
|
import static org.apache.lucene.codecs.lucene80.Lucene80NormsFormat.VERSION_START;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reader for {@link Lucene80NormsFormat}
|
||||||
|
*/
|
||||||
|
final class Lucene80NormsProducer extends NormsProducer implements Cloneable {
|
||||||
|
// metadata maps (just file pointers and minimal stuff)
|
||||||
|
private final Map<Integer,NormsEntry> norms = new HashMap<>();
|
||||||
|
private final int maxDoc;
|
||||||
|
private IndexInput data;
|
||||||
|
private boolean merging;
|
||||||
|
private Map<Integer, IndexInput> disiInputs;
|
||||||
|
private Map<Integer, RandomAccessInput> disiJumpTables;
|
||||||
|
private Map<Integer, RandomAccessInput> dataInputs;
|
||||||
|
|
||||||
|
Lucene80NormsProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
|
||||||
|
maxDoc = state.segmentInfo.maxDoc();
|
||||||
|
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
|
||||||
|
int version = -1;
|
||||||
|
|
||||||
|
// read in the entries from the metadata file.
|
||||||
|
try (ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context)) {
|
||||||
|
Throwable priorE = null;
|
||||||
|
try {
|
||||||
|
version = CodecUtil.checkIndexHeader(in, metaCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||||
|
readFields(in, state.fieldInfos);
|
||||||
|
} catch (Throwable exception) {
|
||||||
|
priorE = exception;
|
||||||
|
} finally {
|
||||||
|
CodecUtil.checkFooter(in, priorE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
|
||||||
|
data = state.directory.openInput(dataName, state.context);
|
||||||
|
boolean success = false;
|
||||||
|
try {
|
||||||
|
final int version2 = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
|
||||||
|
if (version != version2) {
|
||||||
|
throw new CorruptIndexException("Format versions mismatch: meta=" + version + ",data=" + version2, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE: data file is too costly to verify checksum against all the bytes on open,
|
||||||
|
// but for now we at least verify proper structure of the checksum footer: which looks
|
||||||
|
// for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
|
||||||
|
// such as file truncation.
|
||||||
|
CodecUtil.retrieveChecksum(data);
|
||||||
|
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
IOUtils.closeWhileHandlingException(this.data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NormsProducer getMergeInstance() throws IOException {
|
||||||
|
Lucene80NormsProducer clone;
|
||||||
|
try {
|
||||||
|
clone = (Lucene80NormsProducer) super.clone();
|
||||||
|
} catch (CloneNotSupportedException e) {
|
||||||
|
// cannot happen
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
clone.data = data.clone();
|
||||||
|
clone.disiInputs = new HashMap<>();
|
||||||
|
clone.disiJumpTables = new HashMap<>();
|
||||||
|
clone.dataInputs = new HashMap<>();
|
||||||
|
clone.merging = true;
|
||||||
|
return clone;
|
||||||
|
}
|
||||||
|
|
||||||
|
static class NormsEntry {
|
||||||
|
byte denseRankPower;
|
||||||
|
byte bytesPerNorm;
|
||||||
|
long docsWithFieldOffset;
|
||||||
|
long docsWithFieldLength;
|
||||||
|
short jumpTableEntryCount;
|
||||||
|
int numDocsWithField;
|
||||||
|
long normsOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
static abstract class DenseNormsIterator extends NumericDocValues {
|
||||||
|
|
||||||
|
final int maxDoc;
|
||||||
|
int doc = -1;
|
||||||
|
|
||||||
|
DenseNormsIterator(int maxDoc) {
|
||||||
|
this.maxDoc = maxDoc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() throws IOException {
|
||||||
|
return advance(doc + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) throws IOException {
|
||||||
|
if (target >= maxDoc) {
|
||||||
|
return doc = NO_MORE_DOCS;
|
||||||
|
}
|
||||||
|
return doc = target;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean advanceExact(int target) throws IOException {
|
||||||
|
this.doc = target;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long cost() {
|
||||||
|
return maxDoc;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static abstract class SparseNormsIterator extends NumericDocValues {
|
||||||
|
|
||||||
|
final IndexedDISI disi;
|
||||||
|
|
||||||
|
SparseNormsIterator(IndexedDISI disi) {
|
||||||
|
this.disi = disi;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
return disi.docID();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() throws IOException {
|
||||||
|
return disi.nextDoc();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) throws IOException {
|
||||||
|
return disi.advance(target);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean advanceExact(int target) throws IOException {
|
||||||
|
return disi.advanceExact(target);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long cost() {
|
||||||
|
return disi.cost();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readFields(IndexInput meta, FieldInfos infos) throws IOException {
|
||||||
|
for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
|
||||||
|
FieldInfo info = infos.fieldInfo(fieldNumber);
|
||||||
|
if (info == null) {
|
||||||
|
throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
|
||||||
|
} else if (!info.hasNorms()) {
|
||||||
|
throw new CorruptIndexException("Invalid field: " + info.name, meta);
|
||||||
|
}
|
||||||
|
NormsEntry entry = new NormsEntry();
|
||||||
|
entry.docsWithFieldOffset = meta.readLong();
|
||||||
|
entry.docsWithFieldLength = meta.readLong();
|
||||||
|
entry.jumpTableEntryCount = meta.readShort();
|
||||||
|
entry.denseRankPower = meta.readByte();
|
||||||
|
entry.numDocsWithField = meta.readInt();
|
||||||
|
entry.bytesPerNorm = meta.readByte();
|
||||||
|
switch (entry.bytesPerNorm) {
|
||||||
|
case 0: case 1: case 2: case 4: case 8:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new CorruptIndexException("Invalid bytesPerValue: " + entry.bytesPerNorm + ", field: " + info.name, meta);
|
||||||
|
}
|
||||||
|
entry.normsOffset = meta.readLong();
|
||||||
|
norms.put(info.number, entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private RandomAccessInput getDataInput(FieldInfo field, NormsEntry entry) throws IOException {
|
||||||
|
RandomAccessInput slice = null;
|
||||||
|
if (merging) {
|
||||||
|
slice = dataInputs.get(field.number);
|
||||||
|
}
|
||||||
|
if (slice == null) {
|
||||||
|
slice = data.randomAccessSlice(entry.normsOffset, entry.numDocsWithField * (long) entry.bytesPerNorm);
|
||||||
|
if (merging) {
|
||||||
|
dataInputs.put(field.number, slice);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return slice;
|
||||||
|
}
|
||||||
|
|
||||||
|
private IndexInput getDisiInput(FieldInfo field, NormsEntry entry) throws IOException {
|
||||||
|
IndexInput slice = null;
|
||||||
|
if (merging) {
|
||||||
|
slice = disiInputs.get(field.number);
|
||||||
|
}
|
||||||
|
if (slice == null) {
|
||||||
|
slice = IndexedDISI.createBlockSlice(
|
||||||
|
data, "docs", entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.jumpTableEntryCount);
|
||||||
|
if (merging) {
|
||||||
|
disiInputs.put(field.number, slice);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return slice;
|
||||||
|
}
|
||||||
|
|
||||||
|
private RandomAccessInput getDisiJumpTable(FieldInfo field, NormsEntry entry) throws IOException {
|
||||||
|
RandomAccessInput jumpTable = null;
|
||||||
|
if (merging) {
|
||||||
|
jumpTable = disiJumpTables.get(field.number);
|
||||||
|
}
|
||||||
|
if (jumpTable == null) {
|
||||||
|
jumpTable = IndexedDISI.createJumpTable(
|
||||||
|
data, entry.docsWithFieldOffset, entry.docsWithFieldLength, entry.jumpTableEntryCount);
|
||||||
|
if (merging) {
|
||||||
|
disiJumpTables.put(field.number, jumpTable);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return jumpTable;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NumericDocValues getNorms(FieldInfo field) throws IOException {
|
||||||
|
final NormsEntry entry = norms.get(field.number);
|
||||||
|
if (entry.docsWithFieldOffset == -2) {
|
||||||
|
// empty
|
||||||
|
return DocValues.emptyNumeric();
|
||||||
|
} else if (entry.docsWithFieldOffset == -1) {
|
||||||
|
// dense
|
||||||
|
if (entry.bytesPerNorm == 0) {
|
||||||
|
return new DenseNormsIterator(maxDoc) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return entry.normsOffset;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
final RandomAccessInput slice = getDataInput(field, entry);
|
||||||
|
switch (entry.bytesPerNorm) {
|
||||||
|
case 1:
|
||||||
|
return new DenseNormsIterator(maxDoc) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return slice.readByte(doc);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
case 2:
|
||||||
|
return new DenseNormsIterator(maxDoc) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return slice.readShort(((long) doc) << 1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
case 4:
|
||||||
|
return new DenseNormsIterator(maxDoc) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return slice.readInt(((long) doc) << 2);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
case 8:
|
||||||
|
return new DenseNormsIterator(maxDoc) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return slice.readLong(((long) doc) << 3);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
default:
|
||||||
|
// should not happen, we already validate bytesPerNorm in readFields
|
||||||
|
throw new AssertionError();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// sparse
|
||||||
|
final IndexInput disiInput = getDisiInput(field, entry);
|
||||||
|
final RandomAccessInput disiJumpTable = getDisiJumpTable(field, entry);
|
||||||
|
final IndexedDISI disi = new IndexedDISI(disiInput, disiJumpTable, entry.jumpTableEntryCount, entry.denseRankPower, entry.numDocsWithField);
|
||||||
|
|
||||||
|
if (entry.bytesPerNorm == 0) {
|
||||||
|
return new SparseNormsIterator(disi) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return entry.normsOffset;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
final RandomAccessInput slice = data.randomAccessSlice(entry.normsOffset, entry.numDocsWithField * (long) entry.bytesPerNorm);
|
||||||
|
switch (entry.bytesPerNorm) {
|
||||||
|
case 1:
|
||||||
|
return new SparseNormsIterator(disi) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return slice.readByte(disi.index());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
case 2:
|
||||||
|
return new SparseNormsIterator(disi) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return slice.readShort(((long) disi.index()) << 1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
case 4:
|
||||||
|
return new SparseNormsIterator(disi) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return slice.readInt(((long) disi.index()) << 2);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
case 8:
|
||||||
|
return new SparseNormsIterator(disi) {
|
||||||
|
@Override
|
||||||
|
public long longValue() throws IOException {
|
||||||
|
return slice.readLong(((long) disi.index()) << 3);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
default:
|
||||||
|
// should not happen, we already validate bytesPerNorm in readFields
|
||||||
|
throw new AssertionError();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
data.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long ramBytesUsed() {
|
||||||
|
return 64L * norms.size(); // good enough
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void checkIntegrity() throws IOException {
|
||||||
|
CodecUtil.checksumEntireFile(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return getClass().getSimpleName() + "(fields=" + norms.size() + ")";
|
||||||
|
}
|
||||||
|
}
|
|
@ -163,7 +163,7 @@
* all documents omit position data.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Normalization factors}.
* {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}.
* For each field in each document, a value is stored
* that is multiplied into the score for hits on that field.
* </li>

@ -175,7 +175,7 @@
* {@link org.apache.lucene.document.Field Field} constructors
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-document values}.
* {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}.
* Like stored values, these are also keyed by document
* number, but are generally intended to be loaded into main memory for fast
* access. Whereas stored values are generally intended for summary results from

@ -284,12 +284,12 @@
* <td>Stores additional per-position metadata information such as character offsets and user payloads</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene70.Lucene70NormsFormat Norms}</td>
* <td>{@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms}</td>
* <td>.nvd, .nvm</td>
* <td>Encodes length and boost factors for docs and fields</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat Per-Document Values}</td>
* <td>{@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values}</td>
* <td>.dvd, .dvm</td>
* <td>Encodes additional scoring factors or other per-document information.</td>
* </tr>

@ -393,7 +393,9 @@
* doc ids, the (term freq, normalization factor) pairs that may trigger the
* maximum score of the block. This information is recorded alongside skip data
* in order to be able to skip blocks of doc ids if they may not produce high
* enough scores.</li>
* enough scores.
* Additionally, doc values and norms have been extended with jump-tables to make access O(1)
* instead of O(n), where n is the number of elements to skip when advancing in the data.</li>
* </ul>
* <a name="Limitations"></a>
* <h2>Limitations</h2>

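Illustration only, not part of the commit: at the search API level, the jump-tables are what make advancing norms or doc values to an arbitrary target document constant-time rather than a forward scan. A minimal sketch, assuming a LeafReader over a segment written with this codec; the helper class is invented for the example.

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;

final class NormLookupSketch {
  /** Returns the norm for docID, or 0 when the document has no norm for the field. */
  static long normFor(LeafReader reader, String field, int docID) throws IOException {
    NumericDocValues norms = reader.getNormValues(field);
    if (norms != null && norms.advanceExact(docID)) { // O(1) advance thanks to the jump-tables
      return norms.longValue();
    }
    return 0;
  }
}
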
@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat
org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat
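
Illustration only, not part of the commit: the META-INF/services entry above is what lets the new doc values format be resolved by name at read time. A minimal sketch of that lookup, assuming the format's SPI name is "Lucene80".

import org.apache.lucene.codecs.DocValuesFormat;

final class FormatLookupSketch {
  static DocValuesFormat lucene80DocValues() {
    // Resolved through the service-provider registration shown above.
    return DocValuesFormat.forName("Lucene80");
  }
}
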
@ -0,0 +1,522 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene80;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.store.RandomAccessInput;
|
||||||
|
import org.apache.lucene.util.BitSetIterator;
|
||||||
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
|
// Copied from the lucene70 package for separation of codec-code
|
||||||
|
public class TestIndexedDISI extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testEmpty() throws IOException {
|
||||||
|
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// EMPTY blocks are special with regard to jumps as they have size 0
|
||||||
|
public void testEmptyBlocks() throws IOException {
|
||||||
|
final int B = 65536;
|
||||||
|
int maxDoc = B*11;
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
// block 0: EMPTY
|
||||||
|
set.set(B+5); // block 1: SPARSE
|
||||||
|
// block 2: EMPTY
|
||||||
|
// block 3: EMPTY
|
||||||
|
set.set(B*4+5); // block 4: SPARSE
|
||||||
|
|
||||||
|
for (int i = 0 ; i < B ; i++) {
|
||||||
|
set.set(B*6+i); // block 6: ALL
|
||||||
|
}
|
||||||
|
for (int i = 0 ; i < B ; i+=3) {
|
||||||
|
set.set(B*7+i); // block 7: DENSE
|
||||||
|
}
|
||||||
|
for (int i = 0 ; i < B ; i++) {
|
||||||
|
if (i != 32768) {
|
||||||
|
set.set(B*8 + i); // block 8: DENSE (all-1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// block 9-11: EMPTY
|
||||||
|
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
doTestAllSingleJump(set, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Change the first block to DENSE to check that the jump-table correctly points to position 0
|
||||||
|
set.set(0);
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
doTestAllSingleJump(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// EMPTY blocks are special with regard to jumps as they have size 0
|
||||||
|
public void testLastEmptyBlocks() throws IOException {
|
||||||
|
final int B = 65536;
|
||||||
|
int maxDoc = B*3;
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
for (int docID = 0 ; docID < B*2 ; docID++) { // first 2 blocks are ALL
|
||||||
|
set.set(docID);
|
||||||
|
}
|
||||||
|
// Last block is EMPTY
|
||||||
|
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
doTestAllSingleJump(set, dir);
|
||||||
|
assertAdvanceBeyondEnd(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Checks that advance behaves correctly after the end of the blocks has been reached
|
||||||
|
private void assertAdvanceBeyondEnd(FixedBitSet set, Directory dir) throws IOException {
|
||||||
|
final int cardinality = set.cardinality();
|
||||||
|
final byte denseRankPower = 9; // Not tested here so fixed to isolate factors
|
||||||
|
long length;
|
||||||
|
int jumpTableentryCount;
|
||||||
|
try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
|
||||||
|
jumpTableentryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out, denseRankPower);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
|
||||||
|
BitSetIterator disi2 = new BitSetIterator(set, cardinality);
|
||||||
|
int doc = disi2.docID();
|
||||||
|
int index = 0;
|
||||||
|
while (doc < cardinality) {
|
||||||
|
doc = disi2.nextDoc();
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexedDISI disi = new IndexedDISI(in, 0L, in.length(), jumpTableentryCount, denseRankPower, cardinality);
|
||||||
|
// Advance 1 docID beyond end
|
||||||
|
assertFalse("There should be no set bit beyond the valid docID range", disi.advanceExact(set.length()));
|
||||||
|
disi.advance(doc); // Should be the special docID signifying NO_MORE_DOCS from the BitSetIterator
|
||||||
|
assertEquals("The index when advancing beyond the last defined docID should be correct",
|
||||||
|
index, disi.index()+1); // disi.index()+1 as the while-loop also counts the NO_MORE_DOCS
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandomBlocks() throws IOException {
|
||||||
|
final int BLOCKS = 5;
|
||||||
|
FixedBitSet set = createSetWithRandomBlocks(BLOCKS);
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
doTestAllSingleJump(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// When doing merges in Lucene80NormsProducer, IndexedDISI instances are created from slices where the offset is not 0
|
||||||
|
public void testPositionNotZero() throws IOException {
|
||||||
|
final int BLOCKS = 10;
|
||||||
|
final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7)+7); // sane + chance of disable
|
||||||
|
|
||||||
|
FixedBitSet set = createSetWithRandomBlocks(BLOCKS);
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
final int cardinality = set.cardinality();
|
||||||
|
int jumpTableEntryCount;
|
||||||
|
try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
|
||||||
|
jumpTableEntryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out, denseRankPower);
|
||||||
|
}
|
||||||
|
try (IndexInput fullInput = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||||
|
IndexInput blockData =
|
||||||
|
IndexedDISI.createBlockSlice(fullInput, "blocks", 0, fullInput.length(), jumpTableEntryCount);
|
||||||
|
blockData.seek(random().nextInt((int) blockData.length()));
|
||||||
|
|
||||||
|
RandomAccessInput jumpTable = IndexedDISI.createJumpTable(fullInput, 0, fullInput.length(), jumpTableEntryCount);
|
||||||
|
IndexedDISI disi = new IndexedDISI(blockData, jumpTable, jumpTableEntryCount, denseRankPower, cardinality);
|
||||||
|
// This failed at some point during LUCENE-8585 development as it did not reset the slice position
|
||||||
|
disi.advanceExact(BLOCKS*65536-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private FixedBitSet createSetWithRandomBlocks(int blockCount) {
|
||||||
|
final int B = 65536;
|
||||||
|
FixedBitSet set = new FixedBitSet(blockCount * B);
|
||||||
|
for (int block = 0; block < blockCount; block++) {
|
||||||
|
switch (random().nextInt(4)) {
|
||||||
|
case 0: { // EMPTY
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 1: { // ALL
|
||||||
|
for (int docID = block* B; docID < (block+1)* B; docID++) {
|
||||||
|
set.set(docID);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 2: { // SPARSE ( < 4096 )
|
||||||
|
for (int docID = block* B; docID < (block+1)* B; docID += 101) {
|
||||||
|
set.set(docID);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 3: { // DENSE ( >= 4096 )
|
||||||
|
for (int docID = block* B; docID < (block+1)* B; docID += 3) {
|
||||||
|
set.set(docID);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default: throw new IllegalStateException("Modulo logic error: there should only be 4 possibilities");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return set;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void doTestAllSingleJump(FixedBitSet set, Directory dir) throws IOException {
|
||||||
|
final int cardinality = set.cardinality();
|
||||||
|
final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7)+7); // sane + chance of disable
|
||||||
|
long length;
|
||||||
|
int jumpTableentryCount;
|
||||||
|
try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
|
||||||
|
jumpTableentryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out, denseRankPower);
|
||||||
|
length = out.getFilePointer();
|
||||||
|
}
|
||||||
|
|
||||||
|
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||||
|
for (int i = 0; i < set.length(); i++) {
|
||||||
|
IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
|
||||||
|
assertEquals("The bit at " + i + " should be correct with advanceExact", set.get(i), disi.advanceExact(i));
|
||||||
|
|
||||||
|
IndexedDISI disi2 = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
|
||||||
|
disi2.advance(i);
|
||||||
|
// Proper sanity check with jump tables as an error could make them seek backwards
|
||||||
|
assertTrue("The docID should at least be " + i + " after advance(" + i + ") but was " + disi2.docID(),
|
||||||
|
i <= disi2.docID());
|
||||||
|
if (set.get(i)) {
|
||||||
|
assertEquals("The docID should be present with advance", i, disi2.docID());
|
||||||
|
} else {
|
||||||
|
assertNotSame("The docID should not be present with advance", i, disi2.docID());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOneDoc() throws IOException {
|
||||||
|
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
set.set(random().nextInt(maxDoc));
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTwoDocs() throws IOException {
|
||||||
|
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
set.set(random().nextInt(maxDoc));
|
||||||
|
set.set(random().nextInt(maxDoc));
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAllDocs() throws IOException {
|
||||||
|
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
set.set(1, maxDoc);
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testHalfFull() throws IOException {
|
||||||
|
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
for (int i = random().nextInt(2); i < maxDoc; i += TestUtil.nextInt(random(), 1, 3)) {
|
||||||
|
set.set(i);
|
||||||
|
}
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDocRange() throws IOException {
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
for (int iter = 0; iter < 10; ++iter) {
|
||||||
|
int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
final int start = random().nextInt(maxDoc);
|
||||||
|
final int end = TestUtil.nextInt(random(), start + 1, maxDoc);
|
||||||
|
set.set(start, end);
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSparseDenseBoundary() throws IOException {
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
FixedBitSet set = new FixedBitSet(200000);
|
||||||
|
int start = 65536 + random().nextInt(100);
|
||||||
|
final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7)+7); // sane + chance of disable
|
||||||
|
|
||||||
|
// we set MAX_ARRAY_LENGTH bits so the encoding will be sparse
|
||||||
|
set.set(start, start + IndexedDISI.MAX_ARRAY_LENGTH);
|
||||||
|
long length;
|
||||||
|
int jumpTableEntryCount;
|
||||||
|
try (IndexOutput out = dir.createOutput("sparse", IOContext.DEFAULT)) {
|
||||||
|
jumpTableEntryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH), out, denseRankPower);
|
||||||
|
length = out.getFilePointer();
|
||||||
|
}
|
||||||
|
try (IndexInput in = dir.openInput("sparse", IOContext.DEFAULT)) {
|
||||||
|
IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableEntryCount, denseRankPower, IndexedDISI.MAX_ARRAY_LENGTH);
|
||||||
|
assertEquals(start, disi.nextDoc());
|
||||||
|
assertEquals(IndexedDISI.Method.SPARSE, disi.method);
|
||||||
|
}
|
||||||
|
doTest(set, dir);
|
||||||
|
|
||||||
|
// now we set one more bit so the encoding will be dense
|
||||||
|
set.set(start + IndexedDISI.MAX_ARRAY_LENGTH + random().nextInt(100));
|
||||||
|
try (IndexOutput out = dir.createOutput("bar", IOContext.DEFAULT)) {
|
||||||
|
IndexedDISI.writeBitSet(new BitSetIterator(set, IndexedDISI.MAX_ARRAY_LENGTH + 1), out, denseRankPower);
|
||||||
|
length = out.getFilePointer();
|
||||||
|
}
|
||||||
|
try (IndexInput in = dir.openInput("bar", IOContext.DEFAULT)) {
|
||||||
|
IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableEntryCount, denseRankPower, IndexedDISI.MAX_ARRAY_LENGTH + 1);
|
||||||
|
assertEquals(start, disi.nextDoc());
|
||||||
|
assertEquals(IndexedDISI.Method.DENSE, disi.method);
|
||||||
|
}
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOneDocMissing() throws IOException {
|
||||||
|
int maxDoc = TestUtil.nextInt(random(), 1, 1000000);
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
set.set(0, maxDoc);
|
||||||
|
set.clear(random().nextInt(maxDoc));
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFewMissingDocs() throws IOException {
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
for (int iter = 0; iter < 100; ++iter) {
|
||||||
|
int maxDoc = TestUtil.nextInt(random(), 1, 100000);
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
set.set(0, maxDoc);
|
||||||
|
final int numMissingDocs = TestUtil.nextInt(random(), 2, 1000);
|
||||||
|
for (int i = 0; i < numMissingDocs; ++i) {
|
||||||
|
set.clear(random().nextInt(maxDoc));
|
||||||
|
}
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
public void testDenseMultiBlock() throws IOException {
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
int maxDoc = 10 * 65536; // 10 blocks
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
for (int i = 0; i < maxDoc; i += 2) { // Set every other to ensure dense
|
||||||
|
set.set(i);
|
||||||
|
}
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testIllegalDenseRankPower() throws IOException {
|
||||||
|
|
||||||
|
// Legal values
|
||||||
|
for (byte denseRankPower: new byte[]{-1, 7, 8, 9, 10, 11, 12, 13, 14, 15}) {
|
||||||
|
createAndOpenDISI(denseRankPower, denseRankPower);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Illegal values
|
||||||
|
for (byte denseRankPower: new byte[]{-2, 0, 1, 6, 16}) {
|
||||||
|
try {
|
||||||
|
createAndOpenDISI(denseRankPower, (byte) 8); // Illegal write, legal read (should not reach read)
|
||||||
|
fail("Trying to create an IndexedDISI data stream with denseRankPower-read " + denseRankPower +
|
||||||
|
" and denseRankPower-write 8 should fail");
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
// Expected
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
createAndOpenDISI((byte) 8, denseRankPower); // Legal write, illegal read (should reach read)
|
||||||
|
fail("Trying to create an IndexedDISI data stream with denseRankPower-write 8 and denseRankPower-read " +
|
||||||
|
denseRankPower + " should fail");
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
// Expected
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createAndOpenDISI(byte denseRankPowerWrite, byte denseRankPowerRead) throws IOException {
|
||||||
|
FixedBitSet set = new FixedBitSet(10);
|
||||||
|
set.set(set.length()-1);
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
long length;
|
||||||
|
int jumpTableEntryCount = -1;
|
||||||
|
try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
|
||||||
|
jumpTableEntryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, set.cardinality()), out, denseRankPowerWrite);
|
||||||
|
length = out.getFilePointer();
|
||||||
|
}
|
||||||
|
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||||
|
IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableEntryCount, denseRankPowerRead, set.cardinality());
|
||||||
|
}
|
||||||
|
// This tests the legality of the denseRankPower only, so we don't do anything with the disi
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOneDocMissingFixed() throws IOException {
|
||||||
|
int maxDoc = 9699;
|
||||||
|
final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7)+7); // sane + chance of disable
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
set.set(0, maxDoc);
|
||||||
|
set.clear(1345);
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
|
||||||
|
final int cardinality = set.cardinality();
|
||||||
|
long length;
|
||||||
|
int jumpTableentryCount;
|
||||||
|
try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
|
||||||
|
jumpTableentryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out, denseRankPower);
|
||||||
|
length = out.getFilePointer();
|
||||||
|
}
|
||||||
|
|
||||||
|
int step = 16000;
|
||||||
|
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||||
|
IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
|
||||||
|
BitSetIterator disi2 = new BitSetIterator(set, cardinality);
|
||||||
|
assertAdvanceEquality(disi, disi2, step);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandom() throws IOException {
|
||||||
|
try (Directory dir = newDirectory()) {
|
||||||
|
for (int i = 0; i < 10; ++i) {
|
||||||
|
doTestRandom(dir);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doTestRandom(Directory dir) throws IOException {
|
||||||
|
List<Integer> docs = new ArrayList<>();
|
||||||
|
final int maxStep = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 2, 20));
|
||||||
|
final int numDocs = TestUtil.nextInt(random(), 1, Math.min(100000, Integer.MAX_VALUE / maxStep));
|
||||||
|
for (int doc = -1, i = 0; i < numDocs; ++i) {
|
||||||
|
doc += TestUtil.nextInt(random(), 1, maxStep);
|
||||||
|
docs.add(doc);
|
||||||
|
}
|
||||||
|
final int maxDoc = docs.get(docs.size() - 1) + TestUtil.nextInt(random(), 1, 100);
|
||||||
|
|
||||||
|
FixedBitSet set = new FixedBitSet(maxDoc);
|
||||||
|
for (int doc : docs) {
|
||||||
|
set.set(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
doTest(set, dir);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doTest(FixedBitSet set, Directory dir) throws IOException {
|
||||||
|
final int cardinality = set.cardinality();
|
||||||
|
final byte denseRankPower = rarely() ? -1 : (byte) (random().nextInt(7)+7); // sane + chance of disable
|
||||||
|
long length;
|
||||||
|
int jumpTableentryCount;
|
||||||
|
try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) {
|
||||||
|
jumpTableentryCount = IndexedDISI.writeBitSet(new BitSetIterator(set, cardinality), out, denseRankPower);
|
||||||
|
length = out.getFilePointer();
|
||||||
|
}
|
||||||
|
|
||||||
|
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||||
|
IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
|
||||||
|
BitSetIterator disi2 = new BitSetIterator(set, cardinality);
|
||||||
|
assertSingleStepEquality(disi, disi2);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int step : new int[] {1, 10, 100, 1000, 10000, 100000}) {
|
||||||
|
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||||
|
IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
|
||||||
|
BitSetIterator disi2 = new BitSetIterator(set, cardinality);
|
||||||
|
assertAdvanceEquality(disi, disi2, step);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int step : new int[] {10, 100, 1000, 10000, 100000}) {
|
||||||
|
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
|
||||||
|
IndexedDISI disi = new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
|
||||||
|
BitSetIterator disi2 = new BitSetIterator(set, cardinality);
|
||||||
|
int disi2length = set.length();
|
||||||
|
assertAdvanceExactRandomized(disi, disi2, disi2length, step);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dir.deleteFile("foo");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertAdvanceExactRandomized(IndexedDISI disi, BitSetIterator disi2, int disi2length, int step)
|
||||||
|
throws IOException {
|
||||||
|
int index = -1;
|
||||||
|
for (int target = 0; target < disi2length; ) {
|
||||||
|
target += TestUtil.nextInt(random(), 0, step);
|
||||||
|
int doc = disi2.docID();
|
||||||
|
while (doc < target) {
|
||||||
|
doc = disi2.nextDoc();
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean exists = disi.advanceExact(target);
|
||||||
|
assertEquals(doc == target, exists);
|
||||||
|
if (exists) {
|
||||||
|
assertEquals(index, disi.index());
|
||||||
|
} else if (random().nextBoolean()) {
|
||||||
|
assertEquals(doc, disi.nextDoc());
|
||||||
|
// This is a bit strange when doc == NO_MORE_DOCS as the index overcounts in the disi2 while-loop
|
||||||
|
assertEquals(index, disi.index());
|
||||||
|
target = doc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertSingleStepEquality(IndexedDISI disi, BitSetIterator disi2) throws IOException {
|
||||||
|
int i = 0;
|
||||||
|
for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {
|
||||||
|
assertEquals(doc, disi.nextDoc());
|
||||||
|
assertEquals(i++, disi.index());
|
||||||
|
}
|
||||||
|
assertEquals(DocIdSetIterator.NO_MORE_DOCS, disi.nextDoc());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertAdvanceEquality(IndexedDISI disi, BitSetIterator disi2, int step) throws IOException {
|
||||||
|
int index = -1;
|
||||||
|
while (true) {
|
||||||
|
int target = disi2.docID() + step;
|
||||||
|
int doc;
|
||||||
|
do {
|
||||||
|
doc = disi2.nextDoc();
|
||||||
|
index++;
|
||||||
|
} while (doc < target);
|
||||||
|
assertEquals(doc, disi.advance(target));
|
||||||
|
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
assertEquals("Expected equality using step " + step + " at docID " + doc, index, disi.index());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,751 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene80;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
import java.util.function.LongSupplier;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.codecs.Codec;
|
||||||
|
import org.apache.lucene.codecs.DocValuesFormat;
|
||||||
|
import org.apache.lucene.codecs.PostingsFormat;
|
||||||
|
import org.apache.lucene.codecs.asserting.AssertingCodec;
|
||||||
|
import org.apache.lucene.document.BinaryDocValuesField;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.NumericDocValuesField;
|
||||||
|
import org.apache.lucene.document.SortedDocValuesField;
|
||||||
|
import org.apache.lucene.document.SortedNumericDocValuesField;
|
||||||
|
import org.apache.lucene.document.SortedSetDocValuesField;
|
||||||
|
import org.apache.lucene.document.StoredField;
|
||||||
|
import org.apache.lucene.document.StringField;
|
||||||
|
import org.apache.lucene.index.BaseCompressingDocValuesFormatTestCase;
|
||||||
|
import org.apache.lucene.index.BinaryDocValues;
|
||||||
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.DocValues;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
|
import org.apache.lucene.index.IndexableField;
|
||||||
|
import org.apache.lucene.index.LeafReader;
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
import org.apache.lucene.index.NumericDocValues;
|
||||||
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
|
import org.apache.lucene.index.SerialMergeScheduler;
|
||||||
|
import org.apache.lucene.index.SortedDocValues;
|
||||||
|
import org.apache.lucene.index.SortedNumericDocValues;
|
||||||
|
import org.apache.lucene.index.SortedSetDocValues;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.Terms;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
|
import org.apache.lucene.index.TermsEnum.SeekStatus;
|
||||||
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.RAMFile;
|
||||||
|
import org.apache.lucene.store.RAMInputStream;
|
||||||
|
import org.apache.lucene.store.RAMOutputStream;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests Lucene80DocValuesFormat
|
||||||
|
* Copied directly from the lucene70 package for separation of codec-code
|
||||||
|
*/
|
||||||
|
public class TestLucene80DocValuesFormat extends BaseCompressingDocValuesFormatTestCase {
|
||||||
|
private final Codec codec = TestUtil.alwaysDocValuesFormat(new Lucene80DocValuesFormat());
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Codec getCodec() {
|
||||||
|
return codec;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: these big methods can easily blow up some of the other ram-hungry codecs...
|
||||||
|
// for now just keep them here, as we want to test this for this format.
|
||||||
|
|
||||||
|
@Slow
|
||||||
|
public void testSortedSetVariableLengthBigVsStoredFields() throws Exception {
|
||||||
|
int numIterations = atLeast(1);
|
||||||
|
for (int i = 0; i < numIterations; i++) {
|
||||||
|
doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16, 100);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Nightly
|
||||||
|
public void testSortedSetVariableLengthManyVsStoredFields() throws Exception {
|
||||||
|
int numIterations = atLeast(1);
|
||||||
|
for (int i = 0; i < numIterations; i++) {
|
||||||
|
doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16, 100);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Slow
|
||||||
|
public void testSortedVariableLengthBigVsStoredFields() throws Exception {
|
||||||
|
int numIterations = atLeast(1);
|
||||||
|
for (int i = 0; i < numIterations; i++) {
|
||||||
|
doTestSortedVsStoredFields(atLeast(300), 1d, 1, 32766);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Nightly
|
||||||
|
public void testSortedVariableLengthManyVsStoredFields() throws Exception {
|
||||||
|
int numIterations = atLeast(1);
|
||||||
|
for (int i = 0; i < numIterations; i++) {
|
||||||
|
doTestSortedVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1d, 1, 500);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Slow
|
||||||
|
public void testTermsEnumFixedWidth() throws Exception {
|
||||||
|
int numIterations = atLeast(1);
|
||||||
|
for (int i = 0; i < numIterations; i++) {
|
||||||
|
doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> TestUtil.randomSimpleString(random(), 10, 10));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Slow
|
||||||
|
public void testTermsEnumVariableWidth() throws Exception {
|
||||||
|
int numIterations = atLeast(1);
|
||||||
|
for (int i = 0; i < numIterations; i++) {
|
||||||
|
doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> TestUtil.randomSimpleString(random(), 1, 500));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Nightly
|
||||||
|
public void testTermsEnumRandomMany() throws Exception {
|
||||||
|
int numIterations = atLeast(1);
|
||||||
|
for (int i = 0; i < numIterations; i++) {
|
||||||
|
doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 8121), () -> TestUtil.randomSimpleString(random(), 1, 500));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTermsEnumLongSharedPrefixes() throws Exception {
|
||||||
|
int numIterations = atLeast(1);
|
||||||
|
for (int i = 0; i < numIterations; i++) {
|
||||||
|
doTestTermsEnumRandom(TestUtil.nextInt(random(), 1025, 5121), () -> {
|
||||||
|
char[] chars = new char[random().nextInt(500)];
|
||||||
|
Arrays.fill(chars, 'a');
|
||||||
|
if (chars.length > 0) {
|
||||||
|
chars[random().nextInt(chars.length)] = 'b';
|
||||||
|
}
|
||||||
|
return new String(chars);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Slow
|
||||||
|
public void testSparseDocValuesVsStoredFields() throws Exception {
|
||||||
|
int numIterations = atLeast(1);
|
||||||
|
for (int i = 0; i < numIterations; i++) {
|
||||||
|
doTestSparseDocValuesVsStoredFields();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doTestSparseDocValuesVsStoredFields() throws Exception {
|
||||||
|
final long[] values = new long[TestUtil.nextInt(random(), 1, 500)];
|
||||||
|
for (int i = 0; i < values.length; ++i) {
|
||||||
|
values[i] = random().nextLong();
|
||||||
|
}
|
||||||
|
|
||||||
|
Directory dir = newFSDirectory(createTempDir());
|
||||||
|
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
|
||||||
|
conf.setMergeScheduler(new SerialMergeScheduler());
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
|
||||||
|
|
||||||
|
// sparse compression is only enabled if less than 1% of docs have a value
|
||||||
|
final int avgGap = 100;
|
||||||
|
|
||||||
|
final int numDocs = atLeast(200);
|
||||||
|
for (int i = random().nextInt(avgGap * 2); i >= 0; --i) {
|
||||||
|
writer.addDocument(new Document());
|
||||||
|
}
|
||||||
|
final int maxNumValuesPerDoc = random().nextBoolean() ? 1 : TestUtil.nextInt(random(), 2, 5);
|
||||||
|
for (int i = 0; i < numDocs; ++i) {
|
||||||
|
Document doc = new Document();
|
||||||
|
|
||||||
|
// single-valued
|
||||||
|
      long docValue = values[random().nextInt(values.length)];
      doc.add(new NumericDocValuesField("numeric", docValue));
      doc.add(new SortedDocValuesField("sorted", new BytesRef(Long.toString(docValue))));
      doc.add(new BinaryDocValuesField("binary", new BytesRef(Long.toString(docValue))));
      doc.add(new StoredField("value", docValue));

      // multi-valued
      final int numValues = TestUtil.nextInt(random(), 1, maxNumValuesPerDoc);
      for (int j = 0; j < numValues; ++j) {
        docValue = values[random().nextInt(values.length)];
        doc.add(new SortedNumericDocValuesField("sorted_numeric", docValue));
        doc.add(new SortedSetDocValuesField("sorted_set", new BytesRef(Long.toString(docValue))));
        doc.add(new StoredField("values", docValue));
      }

      writer.addDocument(doc);

      // add a gap
      for (int j = TestUtil.nextInt(random(), 0, avgGap * 2); j >= 0; --j) {
        writer.addDocument(new Document());
      }
    }

    if (random().nextBoolean()) {
      writer.forceMerge(1);
    }

    final IndexReader indexReader = writer.getReader();
    writer.close();

    for (LeafReaderContext context : indexReader.leaves()) {
      final LeafReader reader = context.reader();
      final NumericDocValues numeric = DocValues.getNumeric(reader, "numeric");
      final SortedDocValues sorted = DocValues.getSorted(reader, "sorted");
      final BinaryDocValues binary = DocValues.getBinary(reader, "binary");
      final SortedNumericDocValues sortedNumeric = DocValues.getSortedNumeric(reader, "sorted_numeric");
      final SortedSetDocValues sortedSet = DocValues.getSortedSet(reader, "sorted_set");

      for (int i = 0; i < reader.maxDoc(); ++i) {
        final Document doc = reader.document(i);
        final IndexableField valueField = doc.getField("value");
        final Long value = valueField == null ? null : valueField.numericValue().longValue();

        if (value == null) {
          assertTrue(numeric.docID() + " vs " + i, numeric.docID() < i);
        } else {
          assertEquals(i, numeric.nextDoc());
          assertEquals(i, binary.nextDoc());
          assertEquals(i, sorted.nextDoc());
          assertEquals(value.longValue(), numeric.longValue());
          assertTrue(sorted.ordValue() >= 0);
          assertEquals(new BytesRef(Long.toString(value)), sorted.lookupOrd(sorted.ordValue()));
          assertEquals(new BytesRef(Long.toString(value)), binary.binaryValue());
        }

        final IndexableField[] valuesFields = doc.getFields("values");
        if (valuesFields.length == 0) {
          assertTrue(sortedNumeric.docID() + " vs " + i, sortedNumeric.docID() < i);
        } else {
          final Set<Long> valueSet = new HashSet<>();
          for (IndexableField sf : valuesFields) {
            valueSet.add(sf.numericValue().longValue());
          }

          assertEquals(i, sortedNumeric.nextDoc());
          assertEquals(valuesFields.length, sortedNumeric.docValueCount());
          for (int j = 0; j < sortedNumeric.docValueCount(); ++j) {
            assertTrue(valueSet.contains(sortedNumeric.nextValue()));
          }
          assertEquals(i, sortedSet.nextDoc());
          int sortedSetCount = 0;
          while (true) {
            long ord = sortedSet.nextOrd();
            if (ord == SortedSetDocValues.NO_MORE_ORDS) {
              break;
            }
            assertTrue(valueSet.contains(Long.parseLong(sortedSet.lookupOrd(ord).utf8ToString())));
            sortedSetCount++;
          }
          assertEquals(valueSet.size(), sortedSetCount);
        }
      }
    }

    indexReader.close();
    dir.close();
  }
  // TODO: try to refactor this and some termsenum tests into the base class.
  // to do this we need to fix the test class to get a DVF not a Codec so we can setup
  // the postings format correctly.
  private void doTestTermsEnumRandom(int numDocs, Supplier<String> valuesProducer) throws Exception {
    Directory dir = newFSDirectory(createTempDir());
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMergeScheduler(new SerialMergeScheduler());
    // set to duel against a codec which has ordinals:
    final PostingsFormat pf = TestUtil.getPostingsFormatWithOrds(random());
    final DocValuesFormat dv = new Lucene80DocValuesFormat();
    conf.setCodec(new AssertingCodec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return pf;
      }

      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        return dv;
      }
    });
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);

    // index some docs
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
      doc.add(idField);
      int numValues = random().nextInt(17);
      // create a random list of strings
      List<String> values = new ArrayList<>();
      for (int v = 0; v < numValues; v++) {
        values.add(valuesProducer.get());
      }

      // add in any order to the indexed field
      ArrayList<String> unordered = new ArrayList<>(values);
      Collections.shuffle(unordered, random());
      for (String v : values) {
        doc.add(newStringField("indexed", v, Field.Store.NO));
      }

      // add in any order to the dv field
      ArrayList<String> unordered2 = new ArrayList<>(values);
      Collections.shuffle(unordered2, random());
      for (String v : unordered2) {
        doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
      }

      writer.addDocument(doc);
      if (random().nextInt(31) == 0) {
        writer.commit();
      }
    }

    // delete some docs
    int numDeletions = random().nextInt(numDocs/10);
    for (int i = 0; i < numDeletions; i++) {
      int id = random().nextInt(numDocs);
      writer.deleteDocuments(new Term("id", Integer.toString(id)));
    }

    // compare per-segment
    DirectoryReader ir = writer.getReader();
    for (LeafReaderContext context : ir.leaves()) {
      LeafReader r = context.reader();
      Terms terms = r.terms("indexed");
      if (terms != null) {
        SortedSetDocValues ssdv = r.getSortedSetDocValues("dv");
        assertEquals(terms.size(), ssdv.getValueCount());
        TermsEnum expected = terms.iterator();
        TermsEnum actual = r.getSortedSetDocValues("dv").termsEnum();
        assertEquals(terms.size(), expected, actual);

        doTestSortedSetEnumAdvanceIndependently(ssdv);
      }
    }
    ir.close();

    writer.forceMerge(1);

    // now compare again after the merge
    ir = writer.getReader();
    LeafReader ar = getOnlyLeafReader(ir);
    Terms terms = ar.terms("indexed");
    if (terms != null) {
      assertEquals(terms.size(), ar.getSortedSetDocValues("dv").getValueCount());
      TermsEnum expected = terms.iterator();
      TermsEnum actual = ar.getSortedSetDocValues("dv").termsEnum();
      assertEquals(terms.size(), expected, actual);
    }
    ir.close();

    writer.close();
    dir.close();
  }
  private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception {
    BytesRef ref;

    // sequential next() through all terms
    while ((ref = expected.next()) != null) {
      assertEquals(ref, actual.next());
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }
    assertNull(actual.next());

    // sequential seekExact(ord) through all terms
    for (long i = 0; i < numOrds; i++) {
      expected.seekExact(i);
      actual.seekExact(i);
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }

    // sequential seekExact(BytesRef) through all terms
    for (long i = 0; i < numOrds; i++) {
      expected.seekExact(i);
      assertTrue(actual.seekExact(expected.term()));
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }

    // sequential seekCeil(BytesRef) through all terms
    for (long i = 0; i < numOrds; i++) {
      expected.seekExact(i);
      assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term()));
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }

    // random seekExact(ord)
    for (long i = 0; i < numOrds; i++) {
      long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1);
      expected.seekExact(randomOrd);
      actual.seekExact(randomOrd);
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }

    // random seekExact(BytesRef)
    for (long i = 0; i < numOrds; i++) {
      long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1);
      expected.seekExact(randomOrd);
      actual.seekExact(expected.term());
      assertEquals(expected.ord(), actual.ord());
      assertEquals(expected.term(), actual.term());
    }

    // random seekCeil(BytesRef)
    for (long i = 0; i < numOrds; i++) {
      BytesRef target = new BytesRef(TestUtil.randomUnicodeString(random()));
      SeekStatus expectedStatus = expected.seekCeil(target);
      assertEquals(expectedStatus, actual.seekCeil(target));
      if (expectedStatus != SeekStatus.END) {
        assertEquals(expected.ord(), actual.ord());
        assertEquals(expected.term(), actual.term());
      }
    }
  }
  @Slow
  public void testSortedSetAroundBlockSize() throws IOException {
    final int frontier = 1 << Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
    for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
      final Directory dir = newDirectory();
      IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
      RAMFile buffer = new RAMFile();
      RAMOutputStream out = new RAMOutputStream(buffer, false);
      Document doc = new Document();
      SortedSetDocValuesField field1 = new SortedSetDocValuesField("sset", new BytesRef());
      doc.add(field1);
      SortedSetDocValuesField field2 = new SortedSetDocValuesField("sset", new BytesRef());
      doc.add(field2);
      for (int i = 0; i < maxDoc; ++i) {
        BytesRef s1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
        BytesRef s2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
        field1.setBytesValue(s1);
        field2.setBytesValue(s2);
        w.addDocument(doc);
        Set<BytesRef> set = new TreeSet<>(Arrays.asList(s1, s2));
        out.writeVInt(set.size());
        for (BytesRef ref : set) {
          out.writeVInt(ref.length);
          out.writeBytes(ref.bytes, ref.offset, ref.length);
        }
      }
      out.close();
      w.forceMerge(1);
      DirectoryReader r = DirectoryReader.open(w);
      w.close();
      LeafReader sr = getOnlyLeafReader(r);
      assertEquals(maxDoc, sr.maxDoc());
      SortedSetDocValues values = sr.getSortedSetDocValues("sset");
      assertNotNull(values);
      try (RAMInputStream in = new RAMInputStream("", buffer)) {
        BytesRefBuilder b = new BytesRefBuilder();
        for (int i = 0; i < maxDoc; ++i) {
          assertEquals(i, values.nextDoc());
          final int numValues = in.readVInt();

          for (int j = 0; j < numValues; ++j) {
            b.setLength(in.readVInt());
            b.grow(b.length());
            in.readBytes(b.bytes(), 0, b.length());
            assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
          }

          assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
        }
      }
      r.close();
      dir.close();
    }
  }
  @Slow
  public void testSortedNumericAroundBlockSize() throws IOException {
    final int frontier = 1 << Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
    for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
      final Directory dir = newDirectory();
      IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
      RAMFile buffer = new RAMFile();
      RAMOutputStream out = new RAMOutputStream(buffer, false);
      Document doc = new Document();
      SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("snum", 0L);
      doc.add(field1);
      SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("snum", 0L);
      doc.add(field2);
      for (int i = 0; i < maxDoc; ++i) {
        long s1 = random().nextInt(100);
        long s2 = random().nextInt(100);
        field1.setLongValue(s1);
        field2.setLongValue(s2);
        w.addDocument(doc);
        out.writeVLong(Math.min(s1, s2));
        out.writeVLong(Math.max(s1, s2));
      }
      out.close();
      w.forceMerge(1);
      DirectoryReader r = DirectoryReader.open(w);
      w.close();
      LeafReader sr = getOnlyLeafReader(r);
      assertEquals(maxDoc, sr.maxDoc());
      SortedNumericDocValues values = sr.getSortedNumericDocValues("snum");
      assertNotNull(values);
      try (RAMInputStream in = new RAMInputStream("", buffer)) {
        for (int i = 0; i < maxDoc; ++i) {
          assertEquals(i, values.nextDoc());
          assertEquals(2, values.docValueCount());
          assertEquals(in.readVLong(), values.nextValue());
          assertEquals(in.readVLong(), values.nextValue());
        }
      }
      r.close();
      dir.close();
    }
  }
  @Slow
  public void testSortedNumericBlocksOfVariousBitsPerValue() throws Exception {
    doTestSortedNumericBlocksOfVariousBitsPerValue(() -> TestUtil.nextInt(random(), 1, 3));
  }

  @Slow
  public void testSparseSortedNumericBlocksOfVariousBitsPerValue() throws Exception {
    doTestSortedNumericBlocksOfVariousBitsPerValue(() -> TestUtil.nextInt(random(), 0, 2));
  }

  @Slow
  public void testNumericBlocksOfVariousBitsPerValue() throws Exception {
    doTestSparseNumericBlocksOfVariousBitsPerValue(1);
  }

  @Slow
  public void testSparseNumericBlocksOfVariousBitsPerValue() throws Exception {
    doTestSparseNumericBlocksOfVariousBitsPerValue(random().nextDouble());
  }
  // The LUCENE-8585 jump-tables enables O(1) skipping of IndexedDISI blocks, DENSE block lookup
  // and numeric multi blocks. This test focuses on testing these jumps.
  @Slow
  public void testNumericFieldJumpTables() throws Exception {
    // IndexedDISI block skipping only activated if target >= current+2, so we need at least 5 blocks to
    // trigger consecutive block skips
    final int maxDoc = atLeast(5*65536);

    Directory dir = newDirectory();
    IndexWriter iw = createFastIndexWriter(dir, maxDoc);

    Field idField = newStringField("id", "", Field.Store.NO);
    Field storedField = newStringField("stored", "", Field.Store.YES);
    Field dvField = new NumericDocValuesField("dv", 0);

    for (int i = 0 ; i < maxDoc ; i++) {
      Document doc = new Document();
      idField.setStringValue(Integer.toBinaryString(i));
      doc.add(idField);
      if (random().nextInt(100) > 10) { // Skip 10% to make DENSE blocks
        int value = random().nextInt(100000);
        storedField.setStringValue(Integer.toString(value));
        doc.add(storedField);
        dvField.setLongValue(value);
        doc.add(dvField);
      }
      iw.addDocument(doc);
    }
    iw.flush();
    iw.forceMerge(1, true); // Single segment to force large enough structures
    iw.commit();
    iw.close();

    assertDVIterate(dir);
    assertDVAdvance(dir, rarely() ? 1 : 7); // 1 is heavy (~20 s), so we do it rarely. 7 is a lot faster (8 s)

    dir.close();
  }
  private IndexWriter createFastIndexWriter(Directory dir, int maxBufferedDocs) throws IOException {
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMaxBufferedDocs(maxBufferedDocs);
    conf.setRAMBufferSizeMB(-1);
    conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
    return new IndexWriter(dir, conf);
  }
  private static LongSupplier blocksOfVariousBPV() {
    final long mul = TestUtil.nextInt(random(), 1, 100);
    final long min = random().nextInt();
    return new LongSupplier() {
      int i = Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE;
      int maxDelta;
      @Override
      public long getAsLong() {
        if (i == Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE) {
          maxDelta = 1 << random().nextInt(5);
          i = 0;
        }
        i++;
        return min + mul * random().nextInt(maxDelta);
      }
    };
  }
  private void doTestSortedNumericBlocksOfVariousBitsPerValue(LongSupplier counts) throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
    conf.setRAMBufferSizeMB(-1);
    conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
    IndexWriter writer = new IndexWriter(dir, conf);

    final int numDocs = atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
    final LongSupplier values = blocksOfVariousBPV();
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();

      int valueCount = (int) counts.getAsLong();
      long valueArray[] = new long[valueCount];
      for (int j = 0; j < valueCount; j++) {
        long value = values.getAsLong();
        valueArray[j] = value;
        doc.add(new SortedNumericDocValuesField("dv", value));
      }
      Arrays.sort(valueArray);
      for (int j = 0; j < valueCount; j++) {
        doc.add(new StoredField("stored", Long.toString(valueArray[j])));
      }
      writer.addDocument(doc);
      if (random().nextInt(31) == 0) {
        writer.commit();
      }
    }
    writer.forceMerge(1);

    writer.close();

    // compare
    DirectoryReader ir = DirectoryReader.open(dir);
    TestUtil.checkReader(ir);
    for (LeafReaderContext context : ir.leaves()) {
      LeafReader r = context.reader();
      SortedNumericDocValues docValues = DocValues.getSortedNumeric(r, "dv");
      for (int i = 0; i < r.maxDoc(); i++) {
        if (i > docValues.docID()) {
          docValues.nextDoc();
        }
        String expected[] = r.document(i).getValues("stored");
        if (i < docValues.docID()) {
          assertEquals(0, expected.length);
        } else {
          String actual[] = new String[docValues.docValueCount()];
          for (int j = 0; j < actual.length; j++) {
            actual[j] = Long.toString(docValues.nextValue());
          }
          assertArrayEquals(expected, actual);
        }
      }
    }
    ir.close();
    dir.close();
  }
  private void doTestSparseNumericBlocksOfVariousBitsPerValue(double density) throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    conf.setMaxBufferedDocs(atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE));
    conf.setRAMBufferSizeMB(-1);
    conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
    IndexWriter writer = new IndexWriter(dir, conf);
    Document doc = new Document();
    Field storedField = newStringField("stored", "", Field.Store.YES);
    Field dvField = new NumericDocValuesField("dv", 0);
    doc.add(storedField);
    doc.add(dvField);

    final int numDocs = atLeast(Lucene80DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
    final LongSupplier longs = blocksOfVariousBPV();
    for (int i = 0; i < numDocs; i++) {
      if (random().nextDouble() > density) {
        writer.addDocument(new Document());
        continue;
      }
      long value = longs.getAsLong();
      storedField.setStringValue(Long.toString(value));
      dvField.setLongValue(value);
      writer.addDocument(doc);
    }

    writer.forceMerge(1);

    writer.close();

    // compare
    assertDVIterate(dir);
    assertDVAdvance(dir, 1); // Tests all jump-lengths from 1 to maxDoc (quite slow ~= 1 minute for 200K docs)

    dir.close();
  }
  // Tests that advanceExact does not change the outcome
  private void assertDVAdvance(Directory dir, int jumpStep) throws IOException {
    DirectoryReader ir = DirectoryReader.open(dir);
    TestUtil.checkReader(ir);
    for (LeafReaderContext context : ir.leaves()) {
      LeafReader r = context.reader();

      for (int jump = jumpStep; jump < r.maxDoc(); jump += jumpStep) {
        // Create a new instance each time to ensure jumps from the beginning
        NumericDocValues docValues = DocValues.getNumeric(r, "dv");
        for (int docID = 0; docID < r.maxDoc(); docID += jump) {
          String base = "document #" + docID + "/" + r.maxDoc() + ", jumping " + jump + " from #" + (docID-jump);
          String storedValue = r.document(docID).get("stored");
          if (storedValue == null) {
            assertFalse("There should be no DocValue for " + base,
                docValues.advanceExact(docID));
          } else {
            assertTrue("There should be a DocValue for " + base,
                docValues.advanceExact(docID));
            assertEquals("The doc value should be correct for " + base,
                Long.parseLong(storedValue), docValues.longValue());
          }
        }
      }
    }
    ir.close();
  }

}
@@ -0,0 +1,33 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.lucene80;


import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.BaseNormsFormatTestCase;

/**
 * Tests Lucene80NormsFormat
 */
public class TestLucene80NormsFormat extends BaseNormsFormatTestCase {
  private final Codec codec = new Lucene80Codec();

  @Override
  protected Codec getCodec() {
    return codec;
  }
}
@@ -34,7 +34,7 @@ import org.apache.lucene.util.LuceneTestCase;

/** Tests helper methods in DocValues */
public class TestDocValues extends LuceneTestCase {

  /**
   * If the field doesn't exist, we return empty instances:
   * it can easily happen that a segment just doesn't have any docs with the field.

@@ -123,8 +123,8 @@ public class TestDocValues extends LuceneTestCase {
    iw.close();
    dir.close();
  }

  /**
   * field with binary docvalues
   */
  public void testBinaryField() throws Exception {
@@ -1204,6 +1204,9 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
  }

  private void doTestNumericsVsStoredFields(double density, LongSupplier longs) throws Exception {
+   doTestNumericsVsStoredFields(density, longs, 256);
+ }
+ private void doTestNumericsVsStoredFields(double density, LongSupplier longs, int minDocs) throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);

@@ -1216,7 +1219,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
    doc.add(dvField);

    // index some docs
-   int numDocs = atLeast(300);
+   int numDocs = atLeast((int) (minDocs*1.172));
    // numDocs should be always > 256 so that in case of a codec that optimizes
    // for numbers of values <= 256, all storage layouts are tested
    assert numDocs > 256;

@@ -1243,12 +1246,17 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
    }

    // merge some segments and ensure that at least one of them has more than
-   // 256 values
+   // max(256, minDocs) values
-   writer.forceMerge(numDocs / 256);
+   writer.forceMerge(numDocs / Math.max(256, minDocs));

    writer.close();

    // compare
+   assertDVIterate(dir);
+   dir.close();
+ }
+
+ // Asserts equality of stored value vs. DocValue by iterating DocValues one at a time
+ protected void assertDVIterate(Directory dir) throws IOException {
    DirectoryReader ir = DirectoryReader.open(dir);
    TestUtil.checkReader(ir);
    for (LeafReaderContext context : ir.leaves()) {

@@ -1268,9 +1276,8 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
    }
    ir.close();
-   dir.close();
  }

  private void doTestSortedNumericsVsStoredFields(LongSupplier counts, LongSupplier values) throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
@@ -53,7 +53,7 @@ import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.codecs.blockterms.LuceneFixedGap;
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
-import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat;
+import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80Codec;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;

@@ -930,7 +930,7 @@ public final class TestUtil {
   * Returns the actual default docvalues format (e.g. LuceneMNDocValuesFormat for this version of Lucene.
   */
  public static DocValuesFormat getDefaultDocValuesFormat() {
-   return new Lucene70DocValuesFormat();
+   return new Lucene80DocValuesFormat();
  }

  // TODO: generalize all 'test-checks-for-crazy-codecs' to
@@ -19,7 +19,7 @@
  <fieldType name="string_direct" class="solr.StrField" postingsFormat="Direct" docValuesFormat="Direct"/>
  <fieldType name="string_standard" class="solr.StrField" postingsFormat="Lucene50"/>

- <fieldType name="string_disk" class="solr.StrField" docValuesFormat="Lucene70"/>
+ <fieldType name="string_disk" class="solr.StrField" docValuesFormat="Lucene80"/>

  <fieldType name="string" class="solr.StrField"/>