mirror of https://github.com/apache/lucene.git
Index arbitrary fields in taxonomy docs (#12337)
This commit is contained in:
parent
bff5ac0ed0
commit
f339e24e8e
|
@ -197,6 +197,9 @@ New Features
|
||||||
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
|
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
|
||||||
level. (Aditya Prakash, Kaival Parikh)
|
level. (Aditya Prakash, Kaival Parikh)
|
||||||
|
|
||||||
|
* GITHUB#12336: Index additional data per facet label in the taxonomy. (Shai Erera, Egor Potemkin, Mike McCandless,
|
||||||
|
Stefan Vodita)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
|
|
@ -17,9 +17,14 @@
|
||||||
package org.apache.lucene.facet.taxonomy.directory;
|
package org.apache.lucene.facet.taxonomy.directory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* This class holds constants used by the directory taxonomy implementations.
|
||||||
|
*
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
abstract class Consts {
|
abstract class Consts {
|
||||||
|
/** The name of the field containing the full path of a taxonomy document. */
|
||||||
static final String FULL = "$full_path$";
|
static final String FULL = "$full_path$";
|
||||||
|
|
||||||
|
/** The name of the field containing the ordinal of the parent of a taxonomy document. */
|
||||||
static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$";
|
static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$";
|
||||||
}
|
}
|
||||||
|
|
|
@ -243,7 +243,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
|
||||||
* Expert: returns the underlying {@link DirectoryReader} instance that is used by this {@link
|
* Expert: returns the underlying {@link DirectoryReader} instance that is used by this {@link
|
||||||
* TaxonomyReader}.
|
* TaxonomyReader}.
|
||||||
*/
|
*/
|
||||||
protected DirectoryReader getInternalIndexReader() {
|
public DirectoryReader getInternalIndexReader() {
|
||||||
ensureOpen();
|
ensureOpen();
|
||||||
return indexReader;
|
return indexReader;
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,6 +27,7 @@ import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.function.BiConsumer;
|
||||||
import org.apache.lucene.document.BinaryDocValuesField;
|
import org.apache.lucene.document.BinaryDocValuesField;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
|
@ -436,6 +437,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Child classes can override this method to modify the document corresponding to a category path
|
||||||
|
* before indexing it.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {
  // Intentionally a no-op: the base writer adds no extra fields to the category document.
  // Subclasses override this hook; it is called right before the document is passed to
  // indexWriter.addDocument(d).
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Note that the methods calling addCategoryDocument() are synchronized, so this method is
|
* Note that the methods calling addCategoryDocument() are synchronized, so this method is
|
||||||
* effectively synchronized as well.
|
* effectively synchronized as well.
|
||||||
|
@ -453,6 +462,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
|
||||||
|
|
||||||
d.add(fullPathField);
|
d.add(fullPathField);
|
||||||
|
|
||||||
|
// add arbitrary ordinal data to the doc
|
||||||
|
enrichOrdinalDocument(d, categoryPath);
|
||||||
|
|
||||||
indexWriter.addDocument(d);
|
indexWriter.addDocument(d);
|
||||||
int id = nextID.getAndIncrement();
|
int id = nextID.getAndIncrement();
|
||||||
|
|
||||||
|
@ -878,6 +890,34 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
|
||||||
++indexEpoch;
|
++indexEpoch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete the taxonomy and reset all state for this writer.
|
||||||
|
*
|
||||||
|
* <p>To keep using the same main index, you would have to regenerate the taxonomy, taking care
|
||||||
|
* that ordinals are indexed in the same order as before. An example of this can be found in
|
||||||
|
* {@link ReindexingEnrichedDirectoryTaxonomyWriter#reindexWithNewOrdinalData(BiConsumer)}.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
synchronized void deleteAll() throws IOException {
|
||||||
|
indexWriter.deleteAll(); // drop every document from the underlying taxonomy index
|
||||||
|
shouldRefreshReaderManager = true;
|
||||||
|
initReaderManager(); // ensure that it's initialized
|
||||||
|
refreshReaderManager();
|
||||||
|
nextID.set(0); // the next ID handed out by nextID.getAndIncrement() starts from 0 again
|
||||||
|
taxoArrays = null; // must nullify so that it's re-computed next time it's needed
|
||||||
|
|
||||||
|
// Clear the cache and reset its bookkeeping, so that addCategory won't accidentally
|
||||||
|
// return old categories (and their stale ordinals) that are still in the cache.
|
||||||
|
cache.clear();
|
||||||
|
cacheIsComplete = false;
|
||||||
|
shouldFillCache = true;
|
||||||
|
cacheMisses.set(0);
|
||||||
|
|
||||||
|
// bump indexEpoch: deleting the whole taxonomy is equivalent to recreating it
|
||||||
|
++indexEpoch;
|
||||||
|
}
|
||||||
|
|
||||||
/** Returns the {@link Directory} of this taxonomy writer. */
|
/** Returns the {@link Directory} of this taxonomy writer. */
|
||||||
public Directory getDirectory() {
|
public Directory getDirectory() {
|
||||||
return dir;
|
return dir;
|
||||||
|
|
|
@ -0,0 +1,105 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.facet.taxonomy.directory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.BiConsumer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.LeafReader;
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Use this {@link org.apache.lucene.facet.taxonomy.TaxonomyWriter} to append arbitrary fields to
|
||||||
|
* the ordinal documents in the taxonomy. To update the custom data added to the docs, it is
|
||||||
|
* required to {@link #reindexWithNewOrdinalData(BiConsumer)}.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public class ReindexingEnrichedDirectoryTaxonomyWriter extends DirectoryTaxonomyWriter {
|
||||||
|
private BiConsumer<FacetLabel, Document> ordinalDataAppender;
|
||||||
|
|
||||||
|
/** Create a taxonomy writer that will allow editing the ordinal docs before indexing them. */
|
||||||
|
public ReindexingEnrichedDirectoryTaxonomyWriter(
|
||||||
|
Directory d, BiConsumer<FacetLabel, Document> ordinalDataAppender) throws IOException {
|
||||||
|
super(d);
|
||||||
|
this.ordinalDataAppender = ordinalDataAppender;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Add fields specified by the {@link #ordinalDataAppender} to the provided {@link Document}. */
|
||||||
|
@Override
|
||||||
|
protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {
|
||||||
|
if (ordinalDataAppender != null) {
|
||||||
|
ordinalDataAppender.accept(categoryPath, d);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Make a list of all labels in the taxonomy. The index of each label in this list is the ordinal
|
||||||
|
* which corresponds to it.
|
||||||
|
*/
|
||||||
|
private List<FacetLabel> recordPathsInOrder(Directory d) throws IOException {
|
||||||
|
List<FacetLabel> paths = new ArrayList<>();
|
||||||
|
|
||||||
|
DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(d);
|
||||||
|
IndexReader taxoIndexReader = taxoReader.getInternalIndexReader();
|
||||||
|
|
||||||
|
for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
|
||||||
|
LeafReader leafReader = ctx.reader();
|
||||||
|
int[] ordinals = new int[leafReader.maxDoc()];
|
||||||
|
for (int i = 0; i < ordinals.length; i++) {
|
||||||
|
ordinals[i] = ctx.docBase + i;
|
||||||
|
}
|
||||||
|
FacetLabel[] labels = taxoReader.getBulkPath(ordinals);
|
||||||
|
for (FacetLabel label : labels) {
|
||||||
|
paths.add(label);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
IOUtils.close(taxoReader);
|
||||||
|
return paths;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete the existing taxonomy index and recreate it using new ordinal data. The ordinals
|
||||||
|
* themselves will be preserved, so the caller does not need to update references to them in the
|
||||||
|
* main index.
|
||||||
|
*/
|
||||||
|
public synchronized void reindexWithNewOrdinalData(
|
||||||
|
BiConsumer<FacetLabel, Document> ordinalDataAppender) throws IOException {
|
||||||
|
ensureOpen();
|
||||||
|
this.ordinalDataAppender = ordinalDataAppender;
|
||||||
|
Directory d = getDirectory();
|
||||||
|
|
||||||
|
// Record paths in order.
|
||||||
|
List<FacetLabel> ordinalToPath = recordPathsInOrder(d);
|
||||||
|
|
||||||
|
// Delete old taxonomy files.
|
||||||
|
deleteAll();
|
||||||
|
|
||||||
|
// Index paths in order - they will use the new appender.
|
||||||
|
for (FacetLabel categoryPath : ordinalToPath) {
|
||||||
|
addCategory(categoryPath);
|
||||||
|
}
|
||||||
|
commit();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,186 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.facet.taxonomy;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.function.BiConsumer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.NumericDocValuesField;
|
||||||
|
import org.apache.lucene.document.StringField;
|
||||||
|
import org.apache.lucene.facet.FacetField;
|
||||||
|
import org.apache.lucene.facet.FacetTestCase;
|
||||||
|
import org.apache.lucene.facet.FacetsConfig;
|
||||||
|
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
|
||||||
|
import org.apache.lucene.facet.taxonomy.directory.ReindexingEnrichedDirectoryTaxonomyWriter;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
|
import org.apache.lucene.index.LeafReader;
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
import org.apache.lucene.index.NumericDocValues;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
import org.apache.lucene.search.TopDocs;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.junit.After;
|
||||||
|
import org.junit.Before;
|
||||||
|
|
||||||
|
public class TestOrdinalData extends FacetTestCase {
|
||||||
|
Directory taxoDir;
|
||||||
|
DirectoryTaxonomyReader taxoReader;
|
||||||
|
IndexReader taxoIndexReader;
|
||||||
|
ReindexingEnrichedDirectoryTaxonomyWriter taxoWriter;
|
||||||
|
|
||||||
|
private static final Map<String, Long> labelToScore =
|
||||||
|
Map.of(
|
||||||
|
"Bob", 42L,
|
||||||
|
"Lisa", 35L);
|
||||||
|
|
||||||
|
private static class OrdinalDataAppender implements BiConsumer<FacetLabel, Document> {
|
||||||
|
private final Map<String, Long> scores;
|
||||||
|
|
||||||
|
private OrdinalDataAppender(Map<String, Long> scores) {
|
||||||
|
this.scores = scores;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void accept(FacetLabel facetLabel, Document doc) {
|
||||||
|
if (facetLabel.length == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Long score = scores.get(facetLabel.components[facetLabel.length - 1]);
|
||||||
|
if (score != null) {
|
||||||
|
doc.add(new NumericDocValuesField("score", score));
|
||||||
|
doc.add(new StringField("hasScore?", "yes", Field.Store.NO));
|
||||||
|
} else {
|
||||||
|
doc.add(new StringField("hasScore?", "no", Field.Store.NO));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Before
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
|
||||||
|
Directory indexDir = newDirectory();
|
||||||
|
taxoDir = newDirectory();
|
||||||
|
|
||||||
|
IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig());
|
||||||
|
taxoWriter =
|
||||||
|
new ReindexingEnrichedDirectoryTaxonomyWriter(
|
||||||
|
taxoDir, new OrdinalDataAppender(labelToScore));
|
||||||
|
|
||||||
|
FacetsConfig facetsConfig = new FacetsConfig();
|
||||||
|
facetsConfig.setHierarchical("Author", true);
|
||||||
|
facetsConfig.setMultiValued("Author", true);
|
||||||
|
facetsConfig.setHierarchical("Publish Date", true);
|
||||||
|
facetsConfig.setMultiValued("Publish Date", true);
|
||||||
|
|
||||||
|
Document doc;
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new FacetField("Author", "Bob"));
|
||||||
|
doc.add(new FacetField("Publish Date", "2010", "10", "15"));
|
||||||
|
indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new FacetField("Author", "Lisa"));
|
||||||
|
doc.add(new FacetField("Publish Date", "2010", "10", "20"));
|
||||||
|
indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
|
||||||
|
|
||||||
|
IOUtils.close(indexWriter, indexDir);
|
||||||
|
taxoWriter.commit();
|
||||||
|
|
||||||
|
taxoReader = new DirectoryTaxonomyReader(taxoDir);
|
||||||
|
taxoIndexReader = taxoReader.getInternalIndexReader();
|
||||||
|
}
|
||||||
|
|
||||||
|
@After
|
||||||
|
@Override
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
super.tearDown();
|
||||||
|
|
||||||
|
IOUtils.close(taxoWriter);
|
||||||
|
IOUtils.close(taxoReader);
|
||||||
|
IOUtils.close(taxoDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDocValue() throws IOException {
|
||||||
|
// Each unique label will have been assigned a doc.
|
||||||
|
// Additionally, we have the root node of the taxonomy.
|
||||||
|
assertEquals(9, taxoIndexReader.maxDoc());
|
||||||
|
for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
|
||||||
|
LeafReader leafReader = ctx.reader();
|
||||||
|
NumericDocValues scores = leafReader.getNumericDocValues("score");
|
||||||
|
if (scores == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (int ord = 0; ord < leafReader.maxDoc(); ord++) {
|
||||||
|
if (scores.advanceExact(ord) == false) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
FacetLabel label = taxoReader.getPath(ctx.docBase + ord);
|
||||||
|
Long score = labelToScore.get(label.components[label.length - 1]);
|
||||||
|
if (score == null) {
|
||||||
|
throw new IOException("Unexpected score for " + Arrays.toString(label.components));
|
||||||
|
}
|
||||||
|
assertEquals((long) score, scores.longValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void validateSearchResults(IndexSearcher searcher, Map<Query, Integer> queriesAndCounts)
|
||||||
|
throws IOException {
|
||||||
|
for (Map.Entry<Query, Integer> queryAndCount : queriesAndCounts.entrySet()) {
|
||||||
|
Query q = queryAndCount.getKey();
|
||||||
|
int count = queryAndCount.getValue();
|
||||||
|
TopDocs td = searcher.search(q, Integer.MAX_VALUE);
|
||||||
|
assertEquals(count, td.totalHits.value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSearchableField() throws IOException {
|
||||||
|
IndexSearcher taxoSearcher = newSearcher(taxoIndexReader);
|
||||||
|
validateSearchResults(
|
||||||
|
taxoSearcher,
|
||||||
|
Map.of(
|
||||||
|
new TermQuery(new Term("hasScore?", "yes")), 2,
|
||||||
|
new TermQuery(new Term("hasScore?", "no")), 6));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testReindex() throws IOException {
|
||||||
|
taxoWriter.reindexWithNewOrdinalData(new OrdinalDataAppender(new HashMap<>()));
|
||||||
|
taxoReader.close();
|
||||||
|
taxoReader = new DirectoryTaxonomyReader(taxoDir);
|
||||||
|
taxoIndexReader = taxoReader.getInternalIndexReader();
|
||||||
|
|
||||||
|
IndexSearcher taxoSearcher = newSearcher(taxoIndexReader);
|
||||||
|
validateSearchResults(
|
||||||
|
taxoSearcher,
|
||||||
|
Map.of(
|
||||||
|
new TermQuery(new Term("hasScore?", "yes")), 0,
|
||||||
|
new TermQuery(new Term("hasScore?", "no")), 8));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue