From 24aadc220ba9578f581637b9fd0e7e973d46426c Mon Sep 17 00:00:00 2001 From: goankur <47429106+goankur@users.noreply.github.com> Date: Mon, 28 Sep 2020 07:55:37 -0700 Subject: [PATCH] LUCENE-9444: add utility class to retrieve facet labels from the taxonomy index for a facet field (#1893) LUCENE-9444: add utility class to retrieve facet labels from the taxonomy index for a facet field so such fields do not also have to be redundantly stored in the index. Co-authored-by: Ankur Goel --- .../facet/taxonomy/TaxonomyFacetLabels.java | 195 ++++++++++++++++++ .../apache/lucene/facet/FacetTestCase.java | 42 +++- .../taxonomy/TestTaxonomyFacetCounts.java | 44 +++- .../taxonomy/TestTaxonomyFacetLabels.java | 194 +++++++++++++++++ 4 files changed, 472 insertions(+), 3 deletions(-) create mode 100644 lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetLabels.java create mode 100644 lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetLabels.java diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetLabels.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetLabels.java new file mode 100644 index 00000000000..d368e74de2e --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetLabels.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet.taxonomy; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.util.IntsRef; + +import java.io.IOException; + +import static org.apache.lucene.facet.taxonomy.TaxonomyReader.INVALID_ORDINAL; +import static org.apache.lucene.facet.taxonomy.TaxonomyReader.ROOT_ORDINAL; + +/** + * Utility class to easily retrieve previously indexed facet labels, allowing you to skip also adding stored fields for these values, + * reducing your index size. + * + * @lucene.experimental + **/ +public class TaxonomyFacetLabels { + + /** + * Index field name provided to the constructor + */ + private final String indexFieldName; + + /** + * {@code TaxonomyReader} provided to the constructor + */ + private final TaxonomyReader taxoReader; + + + /** + * {@code OrdinalsReader} to decode ordinals previously indexed into the {@code BinaryDocValues} facet field + */ + private final OrdinalsReader ordsReader; + + /** + * Sole constructor. Do not close the provided {@link TaxonomyReader} while still using this instance! + */ + public TaxonomyFacetLabels(TaxonomyReader taxoReader, String indexFieldName) throws IOException { + this.taxoReader = taxoReader; + this.indexFieldName = indexFieldName; + this.ordsReader = new DocValuesOrdinalsReader(indexFieldName); + } + + /** + * Create and return an instance of {@link FacetLabelReader} to retrieve facet labels for + * multiple documents and (optionally) for a specific dimension. You must create this per-segment, + * and then step through all hits, in order, for that segment. 
+ * + *

<p><b>NOTE</b>: This class is not thread-safe, so you must use a new instance of this + * class for each thread.</p>

+ * + * @param readerContext LeafReaderContext used to access the {@code BinaryDocValues} facet field + * @return an instance of {@link FacetLabelReader} + * @throws IOException when a low-level IO issue occurs + */ + public FacetLabelReader getFacetLabelReader(LeafReaderContext readerContext) throws IOException { + return new FacetLabelReader(ordsReader, readerContext); + } + + /** + * Utility class to retrieve facet labels for multiple documents. + * + * @lucene.experimental + */ + public class FacetLabelReader { + private final OrdinalsReader.OrdinalsSegmentReader ordinalsSegmentReader; + private final IntsRef decodedOrds = new IntsRef(); + private int currentDocId = -1; + private int currentPos = -1; + + // Lazily set when nextFacetLabel(int docId, String facetDimension) is first called + private int[] parents; + + /** + * Sole constructor. + */ + public FacetLabelReader(OrdinalsReader ordsReader, LeafReaderContext readerContext) throws IOException { + ordinalsSegmentReader = ordsReader.getReader(readerContext); + } + + /** + * Retrieves the next {@link FacetLabel} for the specified {@code docId}, or {@code null} if there are no more. + * This method has state: if the provided {@code docId} is the same as the previous invocation, it returns the + * next {@link FacetLabel} for that document. Otherwise, it advances to the new {@code docId} and provides the + * first {@link FacetLabel} for that document, or {@code null} if that document has no indexed facets. Each + * new {@code docId} must be in strictly monotonic (increasing) order. + * + *

<p><b>NOTE</b>: The returned FacetLabels may not be in the same order in which they were indexed</p>

+ * + * @param docId input docId provided in monotonic (non-decreasing) order + * @return the first or next {@link FacetLabel}, or {@code null} if there are no more + * @throws IOException when a low-level IO issue occurs + * @throws IllegalArgumentException if docId provided is less than docId supplied in an earlier invocation + */ + public FacetLabel nextFacetLabel(int docId) throws IOException { + if (currentDocId != docId) { + if (docId < currentDocId) { + throw new IllegalArgumentException("docs out of order: previous docId=" + currentDocId + + " current docId=" + docId); + } + ordinalsSegmentReader.get(docId, decodedOrds); + currentDocId = docId; + currentPos = decodedOrds.offset; + } + + int endPos = decodedOrds.offset + decodedOrds.length; + assert currentPos <= endPos; + + if (currentPos == endPos) { + // no more FacetLabels + return null; + } + + int ord = decodedOrds.ints[currentPos++]; + return taxoReader.getPath(ord); + } + + private boolean isDescendant(int ord, int ancestorOrd) { + while (ord != INVALID_ORDINAL && ord != ROOT_ORDINAL) { + if (parents[ord] == ancestorOrd) { + return true; + } + ord = parents[ord]; + } + return false; + } + + /** + * Retrieves the next {@link FacetLabel} for the specified {@code docId} under the requested {@code facetDimension}, + * or {@code null} if there are no more. This method has state: if the provided {@code docId} is the same as the + * previous invocation, it returns the next {@link FacetLabel} for that document. Otherwise, it advances to + * the new {@code docId} and provides the first {@link FacetLabel} for that document, or {@code null} if that document + * has no indexed facets. Each new {@code docId} must be in strictly monotonic (increasing) order. + * + *

<p><b>NOTE</b>: This method loads the {@code int[] parents} array from the taxonomy index. + * The returned FacetLabels may not be in the same order in which they were indexed.</p>

+ * + * @param docId input docId provided in non-decreasing order + * @return the first or next {@link FacetLabel}, or {@code null} if there are no more + * @throws IOException if {@link TaxonomyReader} has problems getting path for an ordinal + * @throws IllegalArgumentException if docId provided is less than docId supplied in an earlier invocation + * @throws IllegalArgumentException if facetDimension is null + */ + public FacetLabel nextFacetLabel(int docId, String facetDimension) throws IOException { + if (facetDimension == null) { + throw new IllegalArgumentException("Input facet dimension cannot be null"); + } + final int parentOrd = taxoReader.getOrdinal(new FacetLabel(facetDimension)); + if (parentOrd == INVALID_ORDINAL) { + throw new IllegalArgumentException("Category ordinal not found for facet dimension: " + facetDimension); + } + + if (currentDocId != docId) { + if (docId < currentDocId) { + throw new IllegalArgumentException("docs out of order: previous docId=" + currentDocId + + " current docId=" + docId); + } + ordinalsSegmentReader.get(docId, decodedOrds); + currentPos = decodedOrds.offset; + currentDocId = docId; + } + + if (parents == null) { + parents = taxoReader.getParallelTaxonomyArrays().parents(); + } + + int endPos = decodedOrds.offset + decodedOrds.length; + assert currentPos <= endPos; + + for (; currentPos < endPos; ) { + int ord = decodedOrds.ints[currentPos++]; + if (isDescendant(ord, parentOrd) == true) { + return taxoReader.getPath(ord); + } + } + return null; + } + } +} diff --git a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java index a88a080a926..6a7b9ec7d7c 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestCase.java @@ -25,18 +25,23 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import 
org.apache.lucene.facet.FacetsCollector.MatchingDocs; import org.apache.lucene.facet.taxonomy.CachedOrdinalsReader; import org.apache.lucene.facet.taxonomy.DocValuesOrdinalsReader; +import org.apache.lucene.facet.taxonomy.FacetLabel; import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts; import org.apache.lucene.facet.taxonomy.OrdinalsReader; import org.apache.lucene.facet.taxonomy.TaxonomyFacetCounts; +import org.apache.lucene.facet.taxonomy.TaxonomyFacetLabels; +import org.apache.lucene.facet.taxonomy.TaxonomyFacetLabels.FacetLabelReader; import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; public abstract class FacetTestCase extends LuceneTestCase { - + public Facets getTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector c) throws IOException { return getTaxonomyFacetCounts(taxoReader, config, c, FacetsConfig.DEFAULT_INDEX_FIELD_NAME); } @@ -56,6 +61,41 @@ public abstract class FacetTestCase extends LuceneTestCase { return facets; } + /** + * Utility method that uses {@link FacetLabelReader} to get facet labels + * for each hit in {@link MatchingDocs}. The method returns {@code List>} + * where outer list has one entry per document and inner list has all {@link FacetLabel} + * entries that belong to a document. The inner list may be empty if no {@link FacetLabel} + * are found for a hit. + * + * @param taxoReader {@link TaxonomyReader} used to read taxonomy during search. This instance is expected to be open for reading. + * @param fc {@link FacetsCollector} A collector with matching hits. + * @return {@code List} where outer list has one non-null entry per document + * and inner list contain all {@link FacetLabel} entries that belong to a document. + * @throws IOException when a low-level IO issue occurs. 
+ */ + public List> getAllTaxonomyFacetLabels(TaxonomyReader taxoReader, FacetsCollector fc) throws IOException { + List> actualLabels = new ArrayList<>(); + TaxonomyFacetLabels taxoLabels = new TaxonomyFacetLabels(taxoReader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME); + + for (MatchingDocs m : fc.getMatchingDocs()) { + FacetLabelReader facetLabelReader = taxoLabels.getFacetLabelReader(m.context); + + DocIdSetIterator disi = m.bits.iterator(); + while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + List facetLabels = new ArrayList<>(); + int docId = disi.docID(); + FacetLabel facetLabel = facetLabelReader.nextFacetLabel(docId); + while (facetLabel != null) { + facetLabels.add(facetLabel); + facetLabel = facetLabelReader.nextFacetLabel(docId); + } + actualLabels.add(facetLabels); + } + } + return actualLabels; + } + protected String[] getRandomTokens(int count) { String[] tokens = new String[count]; for(int i=0;i[] expectedCounts = new HashMap[numDims]; + List> expectedLabels = new ArrayList<>(); + for(int i=0;i(); } - for(TestDoc doc : testDocs) { + for (TestDoc doc : testDocs) { if (doc.content.equals(searchToken)) { - for(int j=0;j facetLabels = new ArrayList<>(); + for (int j = 0; j < numDims; j++) { if (doc.dims[j] != null) { Integer v = expectedCounts[j].get(doc.dims[j]); if (v == null) { @@ -692,8 +696,12 @@ public class TestTaxonomyFacetCounts extends FacetTestCase { } else { expectedCounts[j].put(doc.dims[j], v.intValue() + 1); } + + // Add document facet labels + facetLabels.add(new FacetLabel("dim" + j, doc.dims[j])); } } + expectedLabels.add(facetLabels); } } @@ -711,6 +719,11 @@ public class TestTaxonomyFacetCounts extends FacetTestCase { } } + // Test facet labels for each matching test doc + List> actualLabels = getAllTaxonomyFacetLabels(tr, fc); + assertEquals(expectedLabels.size(), actualLabels.size()); + assertTrue(sortedFacetLabels(expectedLabels).equals(sortedFacetLabels(actualLabels))); + // Sort by highest value, tie break by value: 
sortFacetResults(expected); @@ -726,6 +739,33 @@ public class TestTaxonomyFacetCounts extends FacetTestCase { IOUtils.close(tw, searcher.getIndexReader(), tr, indexDir, taxoDir); } + private static List> sortedFacetLabels(List> allFacetLabels) { + // sort each inner list since there is no guaranteed order in which FacetLabels + // are expected to be retrieved for each document + for (List facetLabels : allFacetLabels) { + Collections.sort(facetLabels); + } + + Collections.sort(allFacetLabels, (o1, o2) -> { + int diff = o1.size() - o2.size(); + if (diff != 0) { + return diff; + } + + // the lists are equal in size and sorted + for (int i = 0; i < o1.size(); i++) { + int comp = o1.get(i).compareTo(o2.get(i)); + if (comp != 0) { + return comp; + } + } + // all elements are equal + return 0; + }); + + return allFacetLabels; + } + private static Facets getAllFacets(String indexFieldName, IndexSearcher searcher, TaxonomyReader taxoReader, FacetsConfig config) throws IOException { if (random().nextBoolean()) { // Aggregate the facet counts: diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetLabels.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetLabels.java new file mode 100644 index 00000000000..3a271b2bd94 --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetLabels.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet.taxonomy; + +import org.apache.lucene.document.Document; +import org.apache.lucene.facet.FacetField; +import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollector.MatchingDocs; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class TestTaxonomyFacetLabels extends FacetTestCase { + + private List prepareDocuments() { + List docs = new ArrayList<>(); + + Document doc = new Document(); + doc.add(new FacetField("Author", "Bob")); + doc.add(new FacetField("Publish Date", "2010", "10", "15")); + docs.add(doc); + + doc = new Document(); + doc.add(new FacetField("Author", "Lisa")); + doc.add(new FacetField("Publish Date", "2010", "10", "20")); + docs.add(doc); + + doc = new Document(); + doc.add(new FacetField("Author", "Tom")); + doc.add(new FacetField("Publish Date", "2012", "1", 
"1")); + docs.add(doc); + + doc = new Document(); + doc.add(new FacetField("Author", "Susan")); + doc.add(new FacetField("Publish Date", "2012", "1", "7")); + docs.add(doc); + + doc = new Document(); + doc.add(new FacetField("Author", "Frank")); + doc.add(new FacetField("Publish Date", "1999", "5", "5")); + docs.add(doc); + + return docs; + } + + private List allDocIds(MatchingDocs m, boolean decreasingDocIds) throws IOException { + DocIdSetIterator disi = m.bits.iterator(); + List docIds = new ArrayList<>(); + while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + docIds.add(disi.docID()); + } + + if (decreasingDocIds == true) { + Collections.reverse(docIds); + } + return docIds; + } + + private List lookupFacetLabels(TaxonomyFacetLabels taxoLabels, + List matchingDocs) throws IOException { + return lookupFacetLabels(taxoLabels, matchingDocs, null, false); + } + + private List lookupFacetLabels(TaxonomyFacetLabels taxoLabels, + List matchingDocs, + String dimension) throws IOException { + return lookupFacetLabels(taxoLabels, matchingDocs, dimension, false); + } + + private List lookupFacetLabels(TaxonomyFacetLabels taxoLabels, List matchingDocs, String dimension, + boolean decreasingDocIds) throws IOException { + List facetLabels = new ArrayList<>(); + + for (MatchingDocs m : matchingDocs) { + TaxonomyFacetLabels.FacetLabelReader facetLabelReader = taxoLabels.getFacetLabelReader(m.context); + List docIds = allDocIds(m, decreasingDocIds); + FacetLabel facetLabel; + for (Integer docId : docIds) { + while (true) { + if (dimension != null) { + facetLabel = facetLabelReader.nextFacetLabel(docId, dimension); + } else { + facetLabel = facetLabelReader.nextFacetLabel(docId); + } + + if (facetLabel == null) { + break; + } + facetLabels.add(facetLabel); + } + } + } + + return facetLabels; + } + + + public void testBasic() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + + // Writes facet ords to a separate directory from the 
main index: + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + FacetsConfig config = new FacetsConfig(); + config.setHierarchical("Publish Date", true); + + for (Document doc : prepareDocuments()) { + writer.addDocument(config.build(taxoWriter, doc)); + } + + // NRT open + IndexSearcher searcher = newSearcher(writer.getReader()); + // NRT open + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); + + FacetsCollector fc = new FacetsCollector(); + searcher.search(new MatchAllDocsQuery(), fc); + + TaxonomyFacetLabels taxoLabels = new TaxonomyFacetLabels(taxoReader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME); + + // Check labels for all dimensions + List facetLabels = lookupFacetLabels(taxoLabels, fc.getMatchingDocs()); + assertEquals("Incorrect number of facet labels received", 10, facetLabels.size()); + + // Check labels for all dimensions + assertTrue(facetLabels.stream() + .filter(l -> "Author".equals(l.components[0])) + .map(l -> l.components[1]).collect(Collectors.toSet()) + .equals(Set.of("Bob", "Lisa", "Susan", "Frank", "Tom"))); + + assertTrue(facetLabels.stream() + .filter(l -> "Publish Date".equals(l.components[0])) + .map(l -> String.join("/", l.components[1], l.components[2], l.components[3])) + .collect(Collectors.toSet()) + .equals(Set.of("2010/10/15", "2010/10/20", "2012/1/1", "2012/1/7", "1999/5/5"))); + + // Check labels for a specific dimension + facetLabels = lookupFacetLabels(taxoLabels, fc.getMatchingDocs(), "Publish Date"); + assertEquals("Incorrect number of facet labels received for 'Publish Date'", 5, facetLabels.size()); + + assertTrue(facetLabels.stream() + .map(l -> String.join("/", l.components[1], l.components[2], l.components[3])) + .collect(Collectors.toSet()) + .equals(Set.of("2010/10/15", "2010/10/20", "2012/1/1", "2012/1/7", "1999/5/5"))); + + try { + facetLabels = 
lookupFacetLabels(taxoLabels, fc.getMatchingDocs(), null, true); + fail("IllegalArgumentException was not thrown for using docIds supplied in decreasing order"); + } catch (IllegalArgumentException ae) { + assertTrue(ae.getMessage().contains("docs out of order")); + } + + try { + facetLabels = lookupFacetLabels(taxoLabels, fc.getMatchingDocs(), "Publish Date", true); + fail("Assertion error was not thrown for using docIds supplied in decreasing order"); + } catch (IllegalArgumentException ae) { + assertTrue(ae.getMessage().contains("docs out of order")); + } + + writer.close(); + IOUtils.close(taxoWriter, searcher.getIndexReader(), taxoReader, taxoDir, dir); + } +}