
LUCENE-10062: Switch to numeric doc values for encoding taxonomy ordinals

This commit is contained in:
Greg Miller 2021-11-19 13:11:42 -08:00 committed by GitHub
parent 6b99f03cdd
commit 0ba310782f
22 changed files with 992 additions and 189 deletions

@@ -367,6 +367,9 @@ Improvements
See release notes. https://github.com/locationtech/spatial4j/releases/tag/spatial4j-0.8
(David Smiley)
* LUCENE-10062: Switch taxonomy faceting to use numeric doc values for storing ordinals instead of binary doc values
with its own custom encoding. (Greg Miller)
Bug fixes
---------------------

@@ -450,3 +450,17 @@ structure. Use a standard BoostQuery here instead.
Rather than using `setSort()` to change sort values, you should instead create
a new Sort instance with the new values.
## Taxonomy-based faceting uses more modern encodings (LUCENE-9450, LUCENE-10062, LUCENE-10122)
The side-car taxonomy index now uses doc values for ord-to-path lookup (LUCENE-9450) and parent
lookup (LUCENE-10122) instead of stored fields and positions (respectively). Document ordinals
are now encoded with `SortedNumericDocValues` instead of using a custom (v-int) binary format.
Performance gains have been observed with these encoding changes, but to benefit from them, users
must create a new index using 9.x (it is not sufficient to reindex documents against an existing
8.x index). In order to remain backwards-compatible with 8.x indexes, the older format is retained
until a full rebuild is done.
Additionally, `OrdinalsReader` (and its sub-classes) have been marked `@Deprecated`, as custom binary
encodings are no longer supported for document ordinals from 9.x onwards (`SortedNumericDocValues` is
used out-of-the-box instead).
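For an index created with 9.x and the default configuration, per-document ordinals can be read straight from the `SortedNumericDocValues` field. A minimal sketch, assuming a 9.x index and the default index field name (the `visitOrdinals` helper is illustrative, not a Lucene API):

```java
import java.io.IOException;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;

class OrdinalVisitor {
  // Walk the taxonomy ordinals of every document in one segment:
  static void visitOrdinals(LeafReader reader) throws IOException {
    SortedNumericDocValues dv =
        reader.getSortedNumericDocValues(FacetsConfig.DEFAULT_INDEX_FIELD_NAME);
    if (dv == null) {
      return; // this segment has no facet ordinals
    }
    for (int doc = dv.nextDoc();
        doc != DocIdSetIterator.NO_MORE_DOCS;
        doc = dv.nextDoc()) {
      for (int i = 0; i < dv.docValueCount(); i++) {
        long ord = dv.nextValue(); // ... use the taxonomy ordinal ...
      }
    }
  }
}
```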

@@ -18,8 +18,15 @@
package org.apache.lucene.facet;
import java.io.IOException;
import java.util.function.BiConsumer;
import org.apache.lucene.facet.taxonomy.BackCompatSortedNumericDocValues;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
/**
* Utility class with a single method for getting a DocIdSetIterator that skips deleted docs
@@ -81,4 +88,47 @@ public final class FacetUtils {
}
};
}
/**
* Loads ordinal values as {@link SortedNumericDocValues}. If the index still uses the older
* binary format, it will wrap that with the SNDV API. Newer format indexes will just load the
* SNDV directly.
*
* <p>This is really only needed/useful to maintain back-compat with the binary format. Once
* back-compat is no longer needed, the SNDV field should just be loaded directly.
*
* @deprecated Please do not rely on this method. It is added as a temporary measure for providing
* index backwards-compatibility with Lucene 8 and earlier indexes, and will be removed in
* Lucene 10.
*/
@Deprecated
public static SortedNumericDocValues loadOrdinalValues(LeafReader reader, String fieldName)
throws IOException {
return loadOrdinalValues(reader, fieldName, null);
}
/**
* Loads ordinal values as {@link SortedNumericDocValues}. If the index still uses the older
* binary format, it will wrap that with the SNDV API. Newer format indexes will just load the
* SNDV directly. The provided {@code binaryValueDecoder} allows custom decoding logic for older
* binary format fields to be provided.
*
* <p>This is really only needed/useful to maintain back-compat with the binary format. Once
* back-compat is no longer needed, the SNDV field should just be loaded directly.
*
* @deprecated Please do not rely on this method. It is added as a temporary measure for providing
* index backwards-compatibility with Lucene 8 and earlier indexes, and will be removed in
* Lucene 10.
*/
@Deprecated
public static SortedNumericDocValues loadOrdinalValues(
LeafReader reader, String fieldName, BiConsumer<BytesRef, IntsRef> binaryValueDecoder)
throws IOException {
if (reader.getMetaData().getCreatedVersionMajor() <= 8) {
BinaryDocValues oldStyleDocValues = reader.getBinaryDocValues(fieldName);
return BackCompatSortedNumericDocValues.wrap(oldStyleDocValues, binaryValueDecoder);
} else {
return reader.getSortedNumericDocValues(fieldName);
}
}
}
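A short usage sketch for the helper above; the segment loop and default field name are assumptions for illustration, not part of this change:

```java
import java.io.IOException;
import org.apache.lucene.facet.FacetUtils;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedNumericDocValues;

class OrdinalLoadingSketch {
  // Load ordinals in a back-compat-safe way for every segment; the returned
  // iterator transparently wraps the legacy binary format for 8.x segments.
  static void visitAllSegments(IndexReader indexReader) throws IOException {
    for (LeafReaderContext ctx : indexReader.leaves()) {
      SortedNumericDocValues ords =
          FacetUtils.loadOrdinalValues(ctx.reader(), FacetsConfig.DEFAULT_INDEX_FIELD_NAME);
      if (ords == null) {
        continue; // no ordinals for this field in this segment
      }
      // iterate with nextDoc() / docValueCount() / nextValue() as usual
    }
  }
}
```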

@@ -28,6 +28,7 @@ import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
@@ -409,9 +410,26 @@ public class FacetsConfig {
indexDrillDownTerms(doc, indexFieldName, dimConfig, facetLabel);
}
// Facet counts:
// DocValues are considered stored fields:
doc.add(new BinaryDocValuesField(indexFieldName, dedupAndEncode(ordinals.get())));
// Store the taxonomy ordinals associated with each doc. Prefer to use SortedNumericDocValues
// but "fall back" to a custom binary format to maintain backwards compatibility with Lucene 8
// indexes.
IntsRef ords = ordinals.get();
if (taxoWriter.useNumericDocValuesForOrdinals()) {
// Dedupe and encode the ordinals. It's not important that we sort here
// (SortedNumericDocValuesField will handle this internally), but we
// sort to identify dups (since SNDVF doesn't dedupe):
Arrays.sort(ords.ints, ords.offset, ords.offset + ords.length);
int prev = -1;
for (int i = 0; i < ords.length; i++) {
int ord = ords.ints[ords.offset + i];
if (ord > prev) {
doc.add(new SortedNumericDocValuesField(indexFieldName, ord));
prev = ord;
}
}
} else {
doc.add(new BinaryDocValuesField(indexFieldName, dedupAndEncode(ords)));
}
}
}
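For context, a sketch of the indexing path that reaches the branch above; the writers and dimension names are assumptions for illustration:

```java
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetField;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.index.IndexWriter;

class FacetIndexingSketch {
  // build() resolves labels to taxonomy ordinals; on a 9.x index they are
  // stored as SortedNumericDocValuesField values (deduped as shown above),
  // while an 8.x index falls back to the legacy binary encoding.
  static void addDoc(IndexWriter indexWriter, TaxonomyWriter taxoWriter) throws IOException {
    FacetsConfig config = new FacetsConfig();
    Document doc = new Document();
    doc.add(new FacetField("Author", "Bob"));
    doc.add(new FacetField("Author", "Lisa"));
    indexWriter.addDocument(config.build(taxoWriter, doc));
  }
}
```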
@@ -507,7 +525,13 @@ public class FacetsConfig {
}
}
/** Encodes ordinals into a BytesRef; expert: subclass can override this to change encoding. */
/**
* Encodes ordinals into a BytesRef; expert: subclass can override this to change encoding.
*
* @deprecated Starting in Lucene 9, we moved to a more straightforward numeric doc values
* encoding and no longer support custom binary encodings.
*/
@Deprecated
protected BytesRef dedupAndEncode(IntsRef ordinals) {
Arrays.sort(ordinals.ints, ordinals.offset, ordinals.length);
byte[] bytes = new byte[5 * ordinals.length];

@@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy;
import java.io.IOException;
import java.util.function.BiConsumer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
/**
* Wraps a {@link BinaryDocValues} instance, providing a {@link SortedNumericDocValues} interface
* for the purpose of being backwards-compatible. (see: LUCENE-10062)
*
* @deprecated Only here for back-compat support. Should be removed with Lucene 10.
*/
@Deprecated
public class BackCompatSortedNumericDocValues extends SortedNumericDocValues {
private final BinaryDocValues binaryDocValues;
private final BiConsumer<BytesRef, IntsRef> binaryValueDecoder;
private final IntsRef scratch = new IntsRef();
private int curr;
/**
* Wrap the provided binary encoded doc values. Decodes the binary values with the provided {@code
* binaryValueDecoder}, allowing the default decoding behavior to be overridden. If a null doc
* values instance is provided, the returned instance will also be null. If a null value decoder
* is specified, the default encoding will be assumed.
*/
public static SortedNumericDocValues wrap(
BinaryDocValues binaryDocValues, BiConsumer<BytesRef, IntsRef> binaryValueDecoder) {
if (binaryDocValues == null) {
return null;
}
return new BackCompatSortedNumericDocValues(binaryDocValues, binaryValueDecoder);
}
/** see the static {@code wrap} methods */
private BackCompatSortedNumericDocValues(
BinaryDocValues binaryDocValues, BiConsumer<BytesRef, IntsRef> binaryValueDecoder) {
assert binaryDocValues != null;
this.binaryDocValues = binaryDocValues;
if (binaryValueDecoder != null) {
this.binaryValueDecoder = binaryValueDecoder;
} else {
this.binaryValueDecoder = BackCompatSortedNumericDocValues::loadValues;
}
}
@Override
public boolean advanceExact(int target) throws IOException {
boolean result = binaryDocValues.advanceExact(target);
if (result) {
reloadValues();
}
return result;
}
@Override
public long nextValue() throws IOException {
curr++;
assert curr < scratch.length;
return scratch.ints[scratch.offset + curr];
}
@Override
public int docValueCount() {
return scratch.length;
}
@Override
public int docID() {
return binaryDocValues.docID();
}
@Override
public int nextDoc() throws IOException {
return advance(binaryDocValues.docID() + 1);
}
@Override
public int advance(int target) throws IOException {
int doc = binaryDocValues.advance(target);
if (doc != NO_MORE_DOCS) {
reloadValues();
}
return doc;
}
@Override
public long cost() {
return binaryDocValues.cost();
}
private void reloadValues() throws IOException {
curr = -1;
binaryValueDecoder.accept(binaryDocValues.binaryValue(), scratch);
}
/** Load ordinals for the currently-positioned doc, assuming the default binary encoding. */
static void loadValues(BytesRef buf, IntsRef ordinals) {
// grow the buffer up front, even if by a large number of values (buf.length)
// that saves the need to check inside the loop for every decoded value if
// the buffer needs to grow.
if (ordinals.ints.length < buf.length) {
ordinals.ints = ArrayUtil.grow(ordinals.ints, buf.length);
}
ordinals.offset = 0;
ordinals.length = 0;
// it is better if the decoding is inlined like so, and not e.g.
// in a utility method
int upto = buf.offset + buf.length;
int value = 0;
int offset = buf.offset;
int prev = 0;
while (offset < upto) {
byte b = buf.bytes[offset++];
if (b >= 0) {
ordinals.ints[ordinals.length] = ((value << 7) | b) + prev;
value = 0;
prev = ordinals.ints[ordinals.length];
ordinals.length++;
} else {
value = (value << 7) | (b & 0x7F);
}
}
}
}
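For reference, a minimal sketch of the legacy encoding that `loadValues` reverses; it is not the exact `FacetsConfig#dedupAndEncode` implementation, and the helper names are illustrative. Ordinals are sorted, delta-encoded against the previous ordinal, and each delta is written MSB-first as a variable-length int whose continuation bytes set the high bit. For example, ordinals {5, 25, 230} become deltas {5, 20, 205} and bytes {0x05, 0x14, 0x81, 0x4D}.

```java
import java.io.ByteArrayOutputStream;

class LegacyOrdinalEncodingSketch {
  // Encode sorted, deduped ordinals as vint deltas, matching loadValues() above.
  static byte[] encode(int[] sortedDedupedOrds) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int prev = 0;
    for (int ord : sortedDedupedOrds) {
      writeVInt(out, ord - prev);
      prev = ord;
    }
    return out.toByteArray();
  }

  private static void writeVInt(ByteArrayOutputStream out, int v) {
    if ((v >>> 7) != 0) {
      writeHighGroups(out, v >>> 7);
    }
    out.write(v & 0x7F); // terminal byte: high bit clear ends the value
  }

  private static void writeHighGroups(ByteArrayOutputStream out, int v) {
    if ((v >>> 7) != 0) {
      writeHighGroups(out, v >>> 7);
    }
    out.write((v & 0x7F) | 0x80); // continuation byte: high bit set
  }
}
```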

@@ -45,7 +45,11 @@ import org.apache.lucene.util.RamUsageEstimator;
*
* <p><b>NOTE:</b> create one instance of this and re-use it for all facet implementations (the
* cache is per-instance, not static).
*
* @deprecated Custom binary encodings for taxonomy ordinals are no longer supported starting with
* Lucene 9
*/
@Deprecated
public class CachedOrdinalsReader extends OrdinalsReader implements Accountable {
private final OrdinalsReader source;

@@ -17,15 +17,22 @@
package org.apache.lucene.facet.taxonomy;
import java.io.IOException;
import org.apache.lucene.facet.FacetUtils;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
/** Decodes ordinals previously indexed into a BinaryDocValues field */
/**
* Decodes ordinals previously indexed into a BinaryDocValues field
*
* @deprecated Custom binary encodings for taxonomy ordinals are no longer supported starting with
* Lucene 9
*/
@Deprecated
public class DocValuesOrdinalsReader extends OrdinalsReader {
private final String field;
@@ -41,12 +48,12 @@ public class DocValuesOrdinalsReader extends OrdinalsReader {
@Override
public OrdinalsSegmentReader getReader(LeafReaderContext context) throws IOException {
BinaryDocValues values0 = context.reader().getBinaryDocValues(field);
if (values0 == null) {
values0 = DocValues.emptyBinary();
SortedNumericDocValues dv0 =
FacetUtils.loadOrdinalValues(context.reader(), field, this::decode);
if (dv0 == null) {
dv0 = DocValues.emptySortedNumeric();
}
final BinaryDocValues values = values0;
final SortedNumericDocValues dv = dv0;
return new OrdinalsSegmentReader() {
@@ -59,16 +66,21 @@ public class DocValuesOrdinalsReader extends OrdinalsReader {
"docs out of order: lastDocID=" + lastDocID + " vs docID=" + docID);
}
lastDocID = docID;
if (docID > values.docID()) {
values.advance(docID);
ordinals.offset = 0;
ordinals.length = 0;
if (dv.advanceExact(docID)) {
int count = dv.docValueCount();
if (ordinals.ints.length < count) {
ordinals.ints = ArrayUtil.grow(ordinals.ints, count);
}
for (int i = 0; i < count; i++) {
ordinals.ints[ordinals.length] = (int) dv.nextValue();
ordinals.length++;
}
}
final BytesRef bytes;
if (values.docID() == docID) {
bytes = values.binaryValue();
} else {
bytes = new BytesRef(BytesRef.EMPTY_BYTES);
}
decode(bytes, ordinals);
}
};
}
@@ -91,33 +103,6 @@ public class DocValuesOrdinalsReader extends OrdinalsReader {
* @param ordinals buffer for decoded ordinals
*/
public void decode(BytesRef buf, IntsRef ordinals) {
// grow the buffer up front, even if by a large number of values (buf.length)
// that saves the need to check inside the loop for every decoded value if
// the buffer needs to grow.
if (ordinals.ints.length < buf.length) {
ordinals.ints = ArrayUtil.grow(ordinals.ints, buf.length);
}
ordinals.offset = 0;
ordinals.length = 0;
// it is better if the decoding is inlined like so, and not e.g.
// in a utility method
int upto = buf.offset + buf.length;
int value = 0;
int offset = buf.offset;
int prev = 0;
while (offset < upto) {
byte b = buf.bytes[offset++];
if (b >= 0) {
ordinals.ints[ordinals.length] = ((value << 7) | b) + prev;
value = 0;
prev = ordinals.ints[ordinals.length];
ordinals.length++;
} else {
value = (value << 7) | (b & 0x7F);
}
}
BackCompatSortedNumericDocValues.loadValues(buf, ordinals);
}
}

@@ -19,17 +19,17 @@ package org.apache.lucene.facet.taxonomy;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.facet.FacetUtils;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConjunctionUtils;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
/**
* Computes facet counts, assuming the default encoding into DocValues was used.
@@ -70,8 +70,9 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets {
private final void count(List<MatchingDocs> matchingDocs) throws IOException {
for (MatchingDocs hits : matchingDocs) {
BinaryDocValues dv = hits.context.reader().getBinaryDocValues(indexFieldName);
if (dv == null) { // this reader does not have DocValues for the requested category list
SortedNumericDocValues dv =
FacetUtils.loadOrdinalValues(hits.context.reader(), indexFieldName);
if (dv == null) {
continue;
}
@@ -79,21 +80,8 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets {
ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), dv));
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
final BytesRef bytesRef = dv.binaryValue();
byte[] bytes = bytesRef.bytes;
int end = bytesRef.offset + bytesRef.length;
int ord = 0;
int offset = bytesRef.offset;
int prev = 0;
while (offset < end) {
byte b = bytes[offset++];
if (b >= 0) {
prev = ord = ((ord << 7) | b) + prev;
increment(ord);
ord = 0;
} else {
ord = (ord << 7) | (b & 0x7F);
}
for (int i = 0; i < dv.docValueCount(); i++) {
increment((int) dv.nextValue());
}
}
}
@@ -103,8 +91,8 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets {
private final void countAll(IndexReader reader) throws IOException {
for (LeafReaderContext context : reader.leaves()) {
BinaryDocValues dv = context.reader().getBinaryDocValues(indexFieldName);
if (dv == null) { // this reader does not have DocValues for the requested category list
SortedNumericDocValues dv = FacetUtils.loadOrdinalValues(context.reader(), indexFieldName);
if (dv == null) {
continue;
}
@@ -114,21 +102,9 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets {
if (liveDocs != null && liveDocs.get(doc) == false) {
continue;
}
final BytesRef bytesRef = dv.binaryValue();
byte[] bytes = bytesRef.bytes;
int end = bytesRef.offset + bytesRef.length;
int ord = 0;
int offset = bytesRef.offset;
int prev = 0;
while (offset < end) {
byte b = bytes[offset++];
if (b >= 0) {
prev = ord = ((ord << 7) | b) + prev;
increment(ord);
ord = 0;
} else {
ord = (ord << 7) | (b & 0x7F);
}
for (int i = 0; i < dv.docValueCount(); i++) {
increment((int) dv.nextValue());
}
}
}
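Both counting paths above are reached through the standard faceting flow. A usage sketch, with the searcher and taxonomy reader assumed to be set up by the caller:

```java
import java.io.IOException;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;

class CountingSketch {
  static FacetResult countAll(IndexSearcher searcher, TaxonomyReader taxoReader)
      throws IOException {
    FacetsConfig config = new FacetsConfig();
    FacetsCollector fc = new FacetsCollector();
    FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc);
    Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc);
    return facets.getTopChildren(10, "Author"); // dimension name is illustrative
  }
}
```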

@@ -16,7 +16,9 @@
*/
package org.apache.lucene.facet.taxonomy;
import com.carrotsearch.hppc.IntArrayList;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.facet.FacetsConfig;
@@ -26,7 +28,10 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.Ordina
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FilterBinaryDocValues;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.FilterSortedNumericDocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
@@ -107,6 +112,66 @@ public class OrdinalMappingLeafReader extends FilterLeafReader {
}
}
private class OrdinalMappingSortedNumericDocValues extends FilterSortedNumericDocValues {
private final IntArrayList currentValues;
private int currIndex;
OrdinalMappingSortedNumericDocValues(SortedNumericDocValues in) {
super(in);
currentValues = new IntArrayList(32);
}
@Override
public boolean advanceExact(int target) throws IOException {
boolean result = in.advanceExact(target);
if (result) {
reloadValues();
}
return result;
}
@Override
public int advance(int target) throws IOException {
int result = in.advance(target);
if (result != DocIdSetIterator.NO_MORE_DOCS) {
reloadValues();
}
return result;
}
@Override
public int nextDoc() throws IOException {
int result = in.nextDoc();
if (result != DocIdSetIterator.NO_MORE_DOCS) {
reloadValues();
}
return result;
}
@Override
public int docValueCount() {
return currentValues.elementsCount;
}
private void reloadValues() throws IOException {
currIndex = 0;
currentValues.clear();
for (int i = 0; i < in.docValueCount(); i++) {
int originalOrd = Math.toIntExact(in.nextValue());
currentValues.add(ordinalMap[originalOrd]);
}
Arrays.sort(currentValues.buffer, 0, currentValues.elementsCount);
}
@Override
public long nextValue() {
assert currIndex < currentValues.size();
int actual = currentValues.get(currIndex);
currIndex++;
return actual;
}
}
private final int[] ordinalMap;
private final InnerFacetsConfig facetsConfig;
private final Set<String> facetFields;
@@ -125,31 +190,59 @@ public class OrdinalMappingLeafReader extends FilterLeafReader {
}
// always add the default indexFieldName. This is because FacetsConfig does
// not explicitly record dimensions that were indexed under the default
// DimConfig, unless they have a custome DimConfig.
// DimConfig, unless they have a custom DimConfig.
facetFields.add(FacetsConfig.DEFAULT_DIM_CONFIG.indexFieldName);
}
/**
* Expert: encodes category ordinals into a BytesRef. Override in case you use custom encoding,
* other than the default done by FacetsConfig.
*
* @deprecated Custom binary formats are no longer directly supported for taxonomy faceting
* starting in Lucene 9
*/
@Deprecated
protected BytesRef encode(IntsRef ordinals) {
return facetsConfig.dedupAndEncode(ordinals);
}
/** Expert: override in case you used custom encoding for the categories under this field. */
/**
* Expert: override in case you used custom encoding for the categories under this field.
*
* @deprecated Custom binary formats are no longer directly supported for taxonomy faceting
* starting in Lucene 9
*/
@Deprecated
protected OrdinalsReader getOrdinalsReader(String field) {
return new DocValuesOrdinalsReader(field);
}
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
if (facetFields.contains(field)) {
BinaryDocValues original = in.getBinaryDocValues(field);
if (original != null && facetFields.contains(field)) {
// The requested field is a facet ordinals field _and_ it's non-null, so move forward with
// mapping:
final OrdinalsReader ordsReader = getOrdinalsReader(field);
return new OrdinalMappingBinaryDocValues(
ordsReader.getReader(in.getContext()), in.getBinaryDocValues(field));
return new OrdinalMappingBinaryDocValues(ordsReader.getReader(in.getContext()), original);
} else {
return in.getBinaryDocValues(field);
// The requested field either isn't present (null) or isn't a facet ordinals field. Either
// way, just return the original:
return original;
}
}
@Override
public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
SortedNumericDocValues original = in.getSortedNumericDocValues(field);
if (original != null && facetFields.contains(field)) {
// The requested field is a facet ordinals field _and_ it's non-null, so move forward with
// mapping:
return new OrdinalMappingSortedNumericDocValues(original);
} else {
// The requested field either isn't present (null) or isn't a facet ordinals field. Either
// way, just return the original:
return original;
}
}
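The typical consumer of this wrapper is index merging, where source-taxonomy ordinals are remapped onto a destination taxonomy. A sketch under stated assumptions (`ordinalMap`, `srcConfig`, the readers and writers all come from the caller; see `TaxonomyMergeUtils` for the full recipe):

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SlowCodecReaderWrapper;

class OrdinalRemappingSketch {
  // Remap source-taxonomy ordinals onto the destination taxonomy while
  // merging; ordinalMap maps each source ordinal to its destination ordinal.
  static void merge(
      DirectoryReader sourceReader,
      int[] ordinalMap,
      FacetsConfig srcConfig,
      IndexWriter destWriter)
      throws IOException {
    List<CodecReader> wrapped = new ArrayList<>();
    for (LeafReaderContext ctx : sourceReader.leaves()) {
      wrapped.add(
          SlowCodecReaderWrapper.wrap(
              new OrdinalMappingLeafReader(ctx.reader(), ordinalMap, srcConfig)));
    }
    destWriter.addIndexes(wrapped.toArray(new CodecReader[0]));
  }
}
```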

@@ -20,7 +20,13 @@ import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.util.IntsRef;
/** Provides per-document ordinals. */
/**
* Provides per-document ordinals.
*
* @deprecated Custom binary encodings for taxonomy ordinals are no longer supported starting with
* Lucene 9
*/
@Deprecated
public abstract class OrdinalsReader {
/** Returns ordinals for documents in one segment. */

@@ -29,8 +29,11 @@ import org.apache.lucene.util.IntsRef;
* Reads from any {@link OrdinalsReader}; use {@link FastTaxonomyFacetCounts} if you are using the
* default encoding from {@link BinaryDocValues}.
*
* @lucene.experimental
* @deprecated Custom binary encodings for taxonomy ordinals are no longer supported starting with
* Lucene 9. Please switch to {@link FastTaxonomyFacetCounts} or implement your own {@link
* org.apache.lucene.facet.Facets} implementation if you have custom needs.
*/
@Deprecated
public class TaxonomyFacetCounts extends IntTaxonomyFacets {
private final OrdinalsReader ordinalsReader;

@@ -20,7 +20,10 @@ import static org.apache.lucene.facet.taxonomy.TaxonomyReader.INVALID_ORDINAL;
import static org.apache.lucene.facet.taxonomy.TaxonomyReader.ROOT_ORDINAL;
import java.io.IOException;
import org.apache.lucene.facet.FacetUtils;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.util.IntsRef;
/**
@@ -34,19 +37,16 @@ public class TaxonomyFacetLabels {
/** {@code TaxonomyReader} provided to the constructor */
private final TaxonomyReader taxoReader;
/**
* {@code OrdinalsReader} to decode ordinals previously indexed into the {@code BinaryDocValues}
* facet field
*/
private final OrdinalsReader ordsReader;
/** field storing the taxonomy ordinals */
private final String indexFieldName;
/**
* Sole constructor. Do not close the provided {@link TaxonomyReader} while still using this
* instance!
*/
public TaxonomyFacetLabels(TaxonomyReader taxoReader, String indexFieldName) throws IOException {
public TaxonomyFacetLabels(TaxonomyReader taxoReader, String indexFieldName) {
this.taxoReader = taxoReader;
this.ordsReader = new DocValuesOrdinalsReader(indexFieldName);
this.indexFieldName = indexFieldName;
}
/**
@@ -62,7 +62,13 @@ public class TaxonomyFacetLabels {
* @throws IOException when a low-level IO issue occurs
*/
public FacetLabelReader getFacetLabelReader(LeafReaderContext readerContext) throws IOException {
return new FacetLabelReader(ordsReader, readerContext);
SortedNumericDocValues ordinalValues =
FacetUtils.loadOrdinalValues(readerContext.reader(), indexFieldName);
if (ordinalValues == null) {
ordinalValues = DocValues.emptySortedNumeric();
}
return new FacetLabelReader(ordinalValues);
}
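A usage sketch for the numeric path, mirroring the back-compat test later in this commit; the searcher, taxonomy reader, doc id and the single-segment assumption are illustrative:

```java
import java.io.IOException;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.search.IndexSearcher;

class FacetLabelSketch {
  static void printLabels(IndexSearcher searcher, TaxonomyReader taxoReader, int docId)
      throws IOException {
    TaxonomyFacetLabels facetLabels =
        new TaxonomyFacetLabels(taxoReader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME);
    // assumes a single-segment index; real code would pick the right leaf for docId
    TaxonomyFacetLabels.FacetLabelReader labelReader =
        facetLabels.getFacetLabelReader(searcher.getIndexReader().leaves().get(0));
    for (FacetLabel label = labelReader.nextFacetLabel(docId);
        label != null;
        label = labelReader.nextFacetLabel(docId)) {
      System.out.println(label); // consume the label
    }
  }
}
```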
/**
@@ -71,18 +77,50 @@ public class TaxonomyFacetLabels {
* @lucene.experimental
*/
public class FacetLabelReader {
/** By default, we store taxonomy ordinals in a SortedNumericDocValues field */
private final SortedNumericDocValues ordinalValues;
/**
* Users can provide their own custom OrdinalsReader for cases where the default encoding isn't
* used. This capability is deprecated and will be removed in Lucene 10.
*/
private final OrdinalsReader.OrdinalsSegmentReader ordinalsSegmentReader;
private final IntsRef decodedOrds = new IntsRef();
private final IntsRef decodedOrds;
private int currentDocId = -1;
private int currentPos = -1;
private boolean currentDocHasValues;
private int currentPos;
private int currentDocOrdinalCount;
// Lazily set when nextFacetLabel(int docId, String facetDimension) is first called
private int[] parents;
/** Sole constructor. */
/**
* Construct from a specified {@link SortedNumericDocValues} field; useful for reading the
* default encoding.
*/
public FacetLabelReader(SortedNumericDocValues ordinalValues) {
this.ordinalValues = ordinalValues;
ordinalsSegmentReader = null;
decodedOrds = null;
}
/**
* Construct using a custom {@link OrdinalsReader}; useful if using a custom binary format.
*
* <p>Note: If using the default encoding, you can use {@link
* #FacetLabelReader(SortedNumericDocValues)} directly
*
* @deprecated Custom binary encodings for taxonomy ordinals are no longer supported starting
* with Lucene 9
*/
@Deprecated
public FacetLabelReader(OrdinalsReader ordsReader, LeafReaderContext readerContext)
throws IOException {
ordinalsSegmentReader = ordsReader.getReader(readerContext);
decodedOrds = new IntsRef();
ordinalValues = null;
}
/**
@@ -108,20 +146,45 @@ public class TaxonomyFacetLabels {
throw new IllegalArgumentException(
"docs out of order: previous docId=" + currentDocId + " current docId=" + docId);
}
ordinalsSegmentReader.get(docId, decodedOrds);
currentDocId = docId;
currentPos = decodedOrds.offset;
if (ordinalsSegmentReader != null) {
ordinalsSegmentReader.get(docId, decodedOrds);
currentPos = decodedOrds.offset;
} else {
currentDocHasValues = ordinalValues.advanceExact(docId);
if (currentDocHasValues) {
currentDocOrdinalCount = ordinalValues.docValueCount();
currentPos = 0;
}
}
}
int endPos = decodedOrds.offset + decodedOrds.length;
assert currentPos <= endPos;
int ord;
if (ordinalsSegmentReader != null) {
int endPos = decodedOrds.offset + decodedOrds.length;
assert currentPos <= endPos;
if (currentPos == endPos) {
// no more FacetLabels
return null;
if (currentPos == endPos) {
return null;
}
ord = decodedOrds.ints[currentPos++];
} else {
if (currentDocHasValues == false) {
return null;
}
assert currentPos <= currentDocOrdinalCount;
if (currentPos == currentDocOrdinalCount) {
return null;
}
ord = (int) ordinalValues.nextValue();
currentPos++;
}
int ord = decodedOrds.ints[currentPos++];
return taxoReader.getPath(ord);
}
@@ -168,24 +231,61 @@ public class TaxonomyFacetLabels {
throw new IllegalArgumentException(
"docs out of order: previous docId=" + currentDocId + " current docId=" + docId);
}
ordinalsSegmentReader.get(docId, decodedOrds);
currentPos = decodedOrds.offset;
currentDocId = docId;
}
if (parents == null) {
parents = taxoReader.getParallelTaxonomyArrays().parents();
}
int endPos = decodedOrds.offset + decodedOrds.length;
assert currentPos <= endPos;
for (; currentPos < endPos; ) {
int ord = decodedOrds.ints[currentPos++];
if (isDescendant(ord, parentOrd) == true) {
return taxoReader.getPath(ord);
if (ordinalsSegmentReader != null) {
ordinalsSegmentReader.get(docId, decodedOrds);
currentPos = decodedOrds.offset;
} else {
currentDocHasValues = ordinalValues.advanceExact(docId);
if (currentDocHasValues) {
currentDocOrdinalCount = ordinalValues.docValueCount();
currentPos = 0;
}
}
}
if (ordinalsSegmentReader != null) {
int endPos = decodedOrds.offset + decodedOrds.length;
assert currentPos <= endPos;
if (currentPos == endPos) {
return null;
}
if (parents == null) {
parents = taxoReader.getParallelTaxonomyArrays().parents();
}
do {
int ord = decodedOrds.ints[currentPos++];
if (isDescendant(ord, parentOrd) == true) {
return taxoReader.getPath(ord);
}
} while (currentPos < endPos);
} else {
if (currentDocHasValues == false) {
return null;
}
assert currentPos <= currentDocOrdinalCount;
if (currentPos == currentDocOrdinalCount) {
return null;
}
if (parents == null) {
parents = taxoReader.getParallelTaxonomyArrays().parents();
}
do {
int ord = (int) ordinalValues.nextValue();
currentPos++;
if (isDescendant(ord, parentOrd) == true) {
return taxoReader.getPath(ord);
}
} while (currentPos < currentDocOrdinalCount);
}
return null;
}
}
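The dimension-filtered variant is consumed the same way as the sketch shown earlier after `getFacetLabelReader`; only the loop changes (the "Author" dimension name is illustrative):

```java
// Same labelReader as above, but restricted to a single dimension:
for (FacetLabel label = labelReader.nextFacetLabel(docId, "Author");
    label != null;
    label = labelReader.nextFacetLabel(docId, "Author")) {
  // only labels under the "Author" dimension are returned
}
```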

@@ -18,9 +18,12 @@ package org.apache.lucene.facet.taxonomy;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.facet.FacetUtils;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConjunctionUtils;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DoubleValues;
import org.apache.lucene.search.DoubleValuesSource;
@@ -36,8 +39,7 @@ public class TaxonomyFacetSumValueSource extends FloatTaxonomyFacets {
/**
* Aggregates double facet values from the provided {@link DoubleValuesSource}, pulling ordinals
* using {@link DocValuesOrdinalsReader} against the default indexed facet field {@link
* FacetsConfig#DEFAULT_INDEX_FIELD_NAME}.
* from the default indexed facet field {@link FacetsConfig#DEFAULT_INDEX_FIELD_NAME}.
*/
public TaxonomyFacetSumValueSource(
TaxonomyReader taxoReader,
@@ -45,18 +47,33 @@ public class TaxonomyFacetSumValueSource extends FloatTaxonomyFacets {
FacetsCollector fc,
DoubleValuesSource valueSource)
throws IOException {
this(
new DocValuesOrdinalsReader(FacetsConfig.DEFAULT_INDEX_FIELD_NAME),
taxoReader,
config,
fc,
valueSource);
this(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, taxoReader, config, fc, valueSource);
}
/**
* Aggregates double facet values from the provided {@link DoubleValuesSource}, pulling ordinals
* from the specified indexed facet field.
*/
public TaxonomyFacetSumValueSource(
String indexField,
TaxonomyReader taxoReader,
FacetsConfig config,
FacetsCollector fc,
DoubleValuesSource valueSource)
throws IOException {
super(indexField, taxoReader, config);
ordinalsReader = null;
sumValues(fc.getMatchingDocs(), fc.getKeepScores(), valueSource);
}
/**
* Aggregates float facet values from the provided {@link DoubleValuesSource}, and pulls ordinals
* from the provided {@link OrdinalsReader}.
*
* @deprecated Custom binary encodings for taxonomy ordinals are no longer supported starting with
* Lucene 9
*/
@Deprecated
public TaxonomyFacetSumValueSource(
OrdinalsReader ordinalsReader,
TaxonomyReader taxoReader,
@@ -91,20 +108,47 @@ public class TaxonomyFacetSumValueSource extends FloatTaxonomyFacets {
List<MatchingDocs> matchingDocs, boolean keepScores, DoubleValuesSource valueSource)
throws IOException {
IntsRef scratch = new IntsRef();
for (MatchingDocs hits : matchingDocs) {
OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.getReader(hits.context);
DoubleValues scores = keepScores ? scores(hits) : null;
DoubleValues functionValues = valueSource.getValues(hits.context, scores);
DocIdSetIterator docs = hits.bits.iterator();
if (ordinalsReader != null) {
// If the user provided a custom ordinals reader, use it to retrieve the document ordinals:
IntsRef scratch = new IntsRef();
for (MatchingDocs hits : matchingDocs) {
OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.getReader(hits.context);
DoubleValues scores = keepScores ? scores(hits) : null;
DoubleValues functionValues = valueSource.getValues(hits.context, scores);
DocIdSetIterator docs = hits.bits.iterator();
int doc;
while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
ords.get(doc, scratch);
if (functionValues.advanceExact(doc)) {
float value = (float) functionValues.doubleValue();
for (int i = 0; i < scratch.length; i++) {
values[scratch.ints[i]] += value;
int doc;
while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
ords.get(doc, scratch);
if (functionValues.advanceExact(doc)) {
float value = (float) functionValues.doubleValue();
for (int i = 0; i < scratch.length; i++) {
values[scratch.ints[i]] += value;
}
}
}
}
} else {
// If no custom ordinals reader is provided, expect the default encoding:
for (MatchingDocs hits : matchingDocs) {
SortedNumericDocValues ordinalValues =
FacetUtils.loadOrdinalValues(hits.context.reader(), indexFieldName);
if (ordinalValues == null) {
continue;
}
DoubleValues scores = keepScores ? scores(hits) : null;
DoubleValues functionValues = valueSource.getValues(hits.context, scores);
DocIdSetIterator it =
ConjunctionUtils.intersectIterators(List.of(hits.bits.iterator(), ordinalValues));
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
if (functionValues.advanceExact(doc)) {
float value = (float) functionValues.doubleValue();
int ordinalCount = ordinalValues.docValueCount();
for (int i = 0; i < ordinalCount; i++) {
values[(int) ordinalValues.nextValue()] += value;
}
}
}
}
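The default-encoding branch above is what the non-deprecated constructors drive. A usage sketch, with the searcher, taxonomy reader and config assumed; a constant 1.0 per hit makes the sum equivalent to a count:

```java
import java.io.IOException;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.search.DoubleValuesSource;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;

class SumValueSourceSketch {
  static FacetResult sum(IndexSearcher searcher, TaxonomyReader taxoReader, FacetsConfig config)
      throws IOException {
    FacetsCollector fc = new FacetsCollector();
    FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, fc);
    Facets facets =
        new TaxonomyFacetSumValueSource(taxoReader, config, fc, DoubleValuesSource.constant(1d));
    return facets.getTopChildren(10, "Author"); // dimension name is illustrative
  }
}
```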

@@ -97,4 +97,18 @@ public interface TaxonomyWriter extends Closeable, TwoPhaseCommit {
/** Returns the commit user data iterable that was set on {@link #setLiveCommitData(Iterable)}. */
public Iterable<Map.Entry<String, String>> getLiveCommitData();
/**
* Determine whether to store taxonomy ordinals for each document using the older binary
* format or the newer SortedNumericDocValues format, based on the version used to create the
* index.
*
* @deprecated Please don't rely on this method as it will be removed in Lucene 10. It's being
* introduced temporarily to support backwards-compatibility with Lucene 8 and earlier index
* formats.
*/
@Deprecated
default boolean useNumericDocValuesForOrdinals() {
return false;
}
}

@@ -162,7 +162,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
openMode = config.getOpenMode();
if (DirectoryReader.indexExists(directory) == false) {
indexEpoch = 1;
// no commit exists so we can safely use the new BinaryDocValues field
// no commit exists so we can safely use the newer formats:
useOlderFormat = false;
} else {
String epochStr = null;
@@ -1005,4 +1005,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
public final long getTaxonomyEpoch() {
return indexEpoch;
}
@Override
public boolean useNumericDocValuesForOrdinals() {
return useOlderFormat == false;
}
}

@@ -190,7 +190,7 @@ public class TestMultipleIndexFields extends FacetTestCase {
private void assertOrdinalsExist(String field, IndexReader ir) throws IOException {
for (LeafReaderContext context : ir.leaves()) {
LeafReader r = context.reader();
if (r.getBinaryDocValues(field) != null) {
if (r.getSortedNumericDocValues(field) != null) {
return; // not all segments must have this DocValues
}
}

@@ -0,0 +1,140 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
public class TestBackCompatSortedNumericDocValues extends LuceneTestCase {
private static class FacetsConfigWrapper extends FacetsConfig {
public BytesRef encodeValues(IntsRef values) {
return dedupAndEncode(values);
}
}
public void testRandom() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
// sorta big scratch so we don't have to think about reallocating:
IntsRef scratch = new IntsRef(100);
// used to access default binary encoding easily:
FacetsConfigWrapper facetsConfig = new FacetsConfigWrapper();
// keep track of the values we expect to see for each doc:
Map<String, List<Integer>> expectedValues = new HashMap<>();
int numDocs = atLeast(100);
for (int i = 0; i < numDocs; i++) {
int numValues = RandomNumbers.randomIntBetween(random(), 1, 50);
scratch.length = 0;
scratch.offset = 0;
Set<Integer> values = new HashSet<>();
for (int j = 0; j < numValues; j++) {
int value = random().nextInt(Integer.MAX_VALUE);
values.add(value);
// we might have dups in here, which is fine (encoding takes care of deduping and sorting):
scratch.ints[j] = value;
scratch.length++;
}
// we expect to get sorted and deduped values back out:
expectedValues.put(String.valueOf(i), values.stream().sorted().collect(Collectors.toList()));
Document doc = new Document();
doc.add(new StoredField("id", String.valueOf(i)));
doc.add(new BinaryDocValuesField("bdv", facetsConfig.encodeValues(scratch)));
writer.addDocument(doc);
}
writer.forceMerge(1);
writer.commit();
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
writer.close();
assert reader.leaves().size() == 1;
BinaryDocValues binaryDocValues = reader.leaves().get(0).reader().getBinaryDocValues("bdv");
assertNotNull(binaryDocValues);
SortedNumericDocValues docValues = BackCompatSortedNumericDocValues.wrap(binaryDocValues, null);
TopFieldDocs docs = searcher.search(new MatchAllDocsQuery(), numDocs, Sort.INDEXORDER);
for (ScoreDoc scoreDoc : docs.scoreDocs) {
String id = reader.document(scoreDoc.doc).get("id");
int docId = scoreDoc.doc;
int doc;
if (random().nextBoolean()) {
doc = docValues.nextDoc();
} else {
if (random().nextBoolean()) {
doc = docValues.advance(docId);
} else {
assertTrue(docValues.advanceExact(docId));
doc = docId;
}
}
assertEquals(docId, doc);
assertEquals(docId, docValues.docID());
List<Integer> expected = expectedValues.get(id);
assertEquals(expected.size(), docValues.docValueCount());
checkValues(expected, docValues);
}
// Run off the end and make sure that case is handled gracefully:
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.nextDoc());
IOUtils.close(reader, dir);
}
private void checkValues(List<Integer> expected, SortedNumericDocValues values)
throws IOException {
for (Integer e : expected) {
assertEquals((long) e, values.nextValue());
}
}
}

@@ -410,9 +410,15 @@ public class TestTaxonomyFacetSumValueSource extends FacetTestCase {
FacetsCollector.search(newSearcher(r), new MatchAllDocsQuery(), 10, fc);
Facets facets1 = getTaxonomyFacetCounts(taxoReader, config, fc);
Facets facets2 =
new TaxonomyFacetSumValueSource(
new DocValuesOrdinalsReader("$b"), taxoReader, config, fc, DoubleValuesSource.SCORES);
Facets facets2;
if (random().nextBoolean()) {
facets2 =
new TaxonomyFacetSumValueSource(
new DocValuesOrdinalsReader("$b"), taxoReader, config, fc, DoubleValuesSource.SCORES);
} else {
facets2 =
new TaxonomyFacetSumValueSource("$b", taxoReader, config, fc, DoubleValuesSource.SCORES);
}
assertEquals(r.maxDoc(), facets1.getTopChildren(10, "a").value.intValue());
assertEquals(r.maxDoc(), facets2.getTopChildren(10, "b").value.doubleValue(), 1E-10);

@@ -20,10 +20,32 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.DrillDownQuery;
import org.apache.lucene.facet.FacetField;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.DocValuesOrdinalsReader;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyFacetLabels;
import org.apache.lucene.facet.taxonomy.TaxonomyFacetSumValueSource;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.DoubleValuesSource;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.Ignore;
@@ -49,50 +71,196 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
//
// Then move the zip file to your trunk checkout and use it in your test cases
public static final String oldTaxonomyIndexName = "taxonomy.8.10.0-cfs";
private static final String OLD_TAXONOMY_INDEX_NAME = "taxonomy.8.11.0-cfs";
private static final String OLD_INDEX_NAME = "index.8.11.0-cfs";
public void testCreateNewTaxonomy() throws IOException {
createNewTaxonomyIndex(oldTaxonomyIndexName);
createNewTaxonomyIndex(OLD_TAXONOMY_INDEX_NAME, OLD_INDEX_NAME);
}
// Opens up a pre-existing old taxonomy index and adds new BinaryDocValues based fields
private void createNewTaxonomyIndex(String dirName) throws IOException {
Path indexDir = createTempDir(oldTaxonomyIndexName);
TestUtil.unzip(getDataInputStream(dirName + ".zip"), indexDir);
Directory dir = newFSDirectory(indexDir);
/**
* This test exercises a bunch of different faceting operations, plus direct taxonomy index
* reads, to make sure the more modern faceting formats introduced in 9.0 are backwards-compatible
* with 8.x indexes. It requires an "older" 8.x index to be in place with assumed docs/categories
* already present. It makes sure it can still run a number of different "read" operations against
* the old index, then it writes new content, forces a merge and does a bunch more "read"
* operations. It may seem a bit chaotic, but it's trying to test a number of different
* faceting-related implementations that require specific back-compat support.
*/
private void createNewTaxonomyIndex(String taxoDirName, String indexDirName) throws IOException {
Path taxoPath = createTempDir(taxoDirName);
Path indexPath = createTempDir(indexDirName);
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
TestUtil.unzip(getDataInputStream(taxoDirName + ".zip"), taxoPath);
TestUtil.unzip(getDataInputStream(indexDirName + ".zip"), indexPath);
Directory taxoDir = newFSDirectory(taxoPath);
Directory indexDir = newFSDirectory(indexPath);
// Open the existing indexes (explicitly open in APPEND mode and fail if they don't exist,
// since we're trying to test back-compat with existing indexes):
DirectoryTaxonomyWriter taxoWriter =
new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.APPEND);
IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
RandomIndexWriter indexWriter =
new RandomIndexWriter(random(), indexDir, indexWriterConfig, random().nextBoolean());
// Use a default FacetsConfig. This assumes that we didn't need to register anything
// interesting when creating the older format index. If that changes, we need a way to
// ensure we re-use the same facet configuration used in creating the old format
// taxonomy index:
FacetsConfig facetsConfig = new FacetsConfig();
// At this point we should have a taxonomy index and "regular" index containing some
// taxonomy categories and documents with facet ordinals indexed. Confirm that we can
// facet and search against it as-is before adding anything new. Of course these tests
// are highly dependent on the index we're starting with, so they will need to be
// updated accordingly if the "old" test index changes:
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
IndexSearcher searcher = newSearcher(indexWriter.getReader());
FacetsCollector facetsCollector = new FacetsCollector();
searcher.search(new MatchAllDocsQuery(), facetsCollector);
// Test a few different facet implementations that we know have back-compat implications:
Facets facets = new FastTaxonomyFacetCounts(taxoReader, facetsConfig, facetsCollector);
FacetResult facetResult = facets.getTopChildren(10, "f1");
assertEquals(2, facetResult.value);
facets =
new TaxonomyFacetCounts(
new DocValuesOrdinalsReader(), taxoReader, facetsConfig, facetsCollector);
facetResult = facets.getTopChildren(10, "f1");
assertEquals(2, facetResult.value);
facets =
new TaxonomyFacetSumValueSource(
taxoReader, facetsConfig, facetsCollector, DoubleValuesSource.constant(1d));
facetResult = facets.getTopChildren(10, "f1");
assertEquals(2.0f, facetResult.value);
// Test that we can drill-down as expected (and read facet labels from matching docs):
TaxonomyFacetLabels facetLabels =
new TaxonomyFacetLabels(taxoReader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME);
assert (searcher.getIndexReader().leaves().size() == 1);
TaxonomyFacetLabels.FacetLabelReader labelReader =
facetLabels.getFacetLabelReader(searcher.getIndexReader().leaves().get(0));
DrillDownQuery query = new DrillDownQuery(facetsConfig, new MatchAllDocsQuery());
query.add("f1", "foo");
TopDocs docResults = searcher.search(query, 10);
assertEquals(1, docResults.totalHits.value);
int docId = docResults.scoreDocs[0].doc;
Set<FacetLabel> labels = new HashSet<>();
for (FacetLabel label = labelReader.nextFacetLabel(docId);
label != null;
label = labelReader.nextFacetLabel(docId)) {
labels.add(label);
}
assertEquals(2, labels.size());
assertTrue(
labels.containsAll(List.of(new FacetLabel("f1", "foo"), new FacetLabel("f2", "foo"))));
assertEquals(0, docResults.scoreDocs[0].doc);
// And make sure we can read directly from the taxonomy like we'd expect:
int ord = taxoReader.getOrdinal(new FacetLabel("f1", "foo"));
assertNotEquals(TaxonomyReader.INVALID_ORDINAL, ord);
assertNotNull(taxoReader.getPath(ord));
// Now we'll add some new docs and taxonomy categories, force merge (to make sure that
// goes well) and then do some more searches, etc.:
Document doc = new Document();
doc.add(new FacetField("f1", "zed"));
indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
FacetLabel cp_c = new FacetLabel("c");
writer.addCategory(cp_c);
writer.getInternalIndexWriter().forceMerge(1);
writer.commit();
taxoWriter.addCategory(cp_c);
TaxonomyReader reader = new DirectoryTaxonomyReader(writer);
indexWriter.forceMerge(1);
taxoWriter.getInternalIndexWriter().forceMerge(1);
indexWriter.commit();
taxoWriter.commit();
int ord1 = reader.getOrdinal(new FacetLabel("a"));
assert ord1 != TaxonomyReader.INVALID_ORDINAL;
// Just asserting ord1 != TaxonomyReader.INVALID_ORDINAL is not enough to check compatibility
assertNotNull(reader.getPath(ord1));
IOUtils.close(taxoReader, searcher.getIndexReader());
taxoReader = new DirectoryTaxonomyReader(taxoWriter);
searcher = newSearcher(indexWriter.getReader());
IOUtils.close(indexWriter, taxoWriter);
int ord2 = reader.getOrdinal(new FacetLabel("b"));
assert ord2 != TaxonomyReader.INVALID_ORDINAL;
// Re-test a number of different use-cases, which should now "see" the newly added content:
facetsCollector = new FacetsCollector();
searcher.search(new MatchAllDocsQuery(), facetsCollector);
facets = new FastTaxonomyFacetCounts(taxoReader, facetsConfig, facetsCollector);
facetResult = facets.getTopChildren(10, "f1");
assertEquals(3, facetResult.value);
facets =
new TaxonomyFacetCounts(
new DocValuesOrdinalsReader(), taxoReader, facetsConfig, facetsCollector);
facetResult = facets.getTopChildren(10, "f1");
assertEquals(3, facetResult.value);
facets =
new TaxonomyFacetSumValueSource(
taxoReader, facetsConfig, facetsCollector, DoubleValuesSource.constant(1d));
facetResult = facets.getTopChildren(10, "f1");
assertEquals(3.0f, facetResult.value);
// Test that we can drill-down as expected, and access facet labels:
facetLabels = new TaxonomyFacetLabels(taxoReader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME);
assert (searcher.getIndexReader().leaves().size() == 1);
labelReader = facetLabels.getFacetLabelReader(searcher.getIndexReader().leaves().get(0));
query = new DrillDownQuery(facetsConfig, new MatchAllDocsQuery());
query.add("f1", "foo");
docResults = searcher.search(query, 10);
assertEquals(1, docResults.totalHits.value);
docId = docResults.scoreDocs[0].doc;
labels = new HashSet<>();
for (FacetLabel label = labelReader.nextFacetLabel(docId);
label != null;
label = labelReader.nextFacetLabel(docId)) {
labels.add(label);
}
assertEquals(2, labels.size());
assertTrue(
labels.containsAll(List.of(new FacetLabel("f1", "foo"), new FacetLabel("f2", "foo"))));
labelReader = facetLabels.getFacetLabelReader(searcher.getIndexReader().leaves().get(0));
query = new DrillDownQuery(facetsConfig, new MatchAllDocsQuery());
query.add("f1", "zed");
docResults = searcher.search(query, 10);
assertEquals(1, docResults.totalHits.value);
docId = docResults.scoreDocs[0].doc;
labels = new HashSet<>();
for (FacetLabel label = labelReader.nextFacetLabel(docId);
label != null;
label = labelReader.nextFacetLabel(docId)) {
labels.add(label);
}
assertEquals(1, labels.size());
assertTrue(labels.contains(new FacetLabel("f1", "zed")));
// And make sure we can read directly from the taxonomy like we'd expect:
ord = taxoReader.getOrdinal(new FacetLabel("f1", "foo"));
assertNotEquals(TaxonomyReader.INVALID_ORDINAL, ord);
assertNotNull(taxoReader.getPath(ord));
ord = taxoReader.getOrdinal(new FacetLabel("f1", "zed"));
assertNotEquals(TaxonomyReader.INVALID_ORDINAL, ord);
assertNotNull(taxoReader.getPath(ord));
// And check a few more direct reads from the taxonomy:
ord = taxoReader.getOrdinal(new FacetLabel("a"));
assertNotEquals(TaxonomyReader.INVALID_ORDINAL, ord);
assertNotNull(taxoReader.getPath(ord));
ord = taxoReader.getOrdinal(new FacetLabel("b"));
assertNotEquals(TaxonomyReader.INVALID_ORDINAL, ord);
// Just asserting ord2 != TaxonomyReader.INVALID_ORDINAL is not enough to check compatibility
assertNotNull(reader.getPath(ord2));
assertNotNull(taxoReader.getPath(ord));
int ord3 = reader.getOrdinal(cp_c);
assert ord3 != TaxonomyReader.INVALID_ORDINAL;
assertNotNull(reader.getPath(ord3));
ord = taxoReader.getOrdinal(cp_c);
assertNotEquals(TaxonomyReader.INVALID_ORDINAL, ord);
assertNotNull(taxoReader.getPath(ord));
reader.close();
writer.close();
dir.close();
IOUtils.close(taxoReader, searcher.getIndexReader(), taxoDir, indexDir);
}
// Opens up a pre-existing index and tries to run getBulkPath on it
public void testGetBulkPathOnOlderCodec() throws Exception {
Path indexDir = createTempDir(oldTaxonomyIndexName);
TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir);
Path indexDir = createTempDir(OLD_TAXONOMY_INDEX_NAME);
TestUtil.unzip(getDataInputStream(OLD_TAXONOMY_INDEX_NAME + ".zip"), indexDir);
Directory dir = newFSDirectory(indexDir);
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
@@ -114,21 +282,41 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
// Used to create a fresh taxonomy index with StoredFields
@Ignore
public void testCreateOldTaxonomy() throws IOException {
createOldTaxonomyIndex(oldTaxonomyIndexName);
createOldTaxonomyIndex(OLD_TAXONOMY_INDEX_NAME, OLD_INDEX_NAME);
}
private void createOldTaxonomyIndex(String dirName) throws IOException {
Path indexDir = getIndexDir().resolve(dirName);
Files.deleteIfExists(indexDir);
Directory dir = newFSDirectory(indexDir);
private void createOldTaxonomyIndex(String taxoDirName, String indexDirName) throws IOException {
Path taxoPath = getIndexDir().resolve(taxoDirName);
Path indexPath = getIndexDir().resolve(indexDirName);
Files.deleteIfExists(taxoPath);
Files.deleteIfExists(indexPath);
TaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
Directory taxoDir = newFSDirectory(taxoPath);
Directory indexDir = newFSDirectory(indexPath);
writer.addCategory(new FacetLabel("a"));
writer.addCategory(new FacetLabel("b"));
writer.commit();
writer.close();
dir.close();
TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
FacetsConfig facetsConfig = new FacetsConfig();
RandomIndexWriter indexWriter = new RandomIndexWriter(random(), indexDir);
Document doc = new Document();
doc.add(new FacetField("f1", "foo"));
doc.add(new FacetField("f2", "foo"));
indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("f1", "bar"));
indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("f2", "bar"));
indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
taxoWriter.addCategory(new FacetLabel("a"));
taxoWriter.addCategory(new FacetLabel("b"));
indexWriter.commit();
taxoWriter.commit();
IOUtils.close(indexWriter, taxoWriter, indexDir, taxoDir);
}
private Path getIndexDir() {