From 8fdd48b3b3c4658be800df78c4c63b3cdf1c9d69 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 4 Dec 2024 12:31:00 +0000 Subject: [PATCH] Add support for storing term vectors in FeatureField (#14034) This update introduces an option to store term vectors generated by the FeatureField. With this option, term vectors can be used to access all features for each document. --- lucene/CHANGES.txt | 2 +- .../apache/lucene/document/FeatureField.java | 22 ++++- .../lucene/document/TestFeatureField.java | 80 +++++++++++++++++++ 3 files changed, 102 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f4717e88537..98580d3ea3d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -47,7 +47,7 @@ API Changes New Features --------------------- -(No changes) +* GITHUB#14034: Add support for storing term vectors in FeatureField. (Jim Ferenczi) Improvements --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/document/FeatureField.java b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java index bedd95cf8a5..233063c9da0 100644 --- a/lucene/core/src/java/org/apache/lucene/document/FeatureField.java +++ b/lucene/core/src/java/org/apache/lucene/document/FeatureField.java @@ -105,11 +105,17 @@ import org.apache.lucene.search.similarities.Similarity.SimScorer; public final class FeatureField extends Field { private static final FieldType FIELD_TYPE = new FieldType(); + private static final FieldType FIELD_TYPE_STORE_TERM_VECTORS = new FieldType(); static { FIELD_TYPE.setTokenized(false); FIELD_TYPE.setOmitNorms(true); FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS); + + FIELD_TYPE_STORE_TERM_VECTORS.setTokenized(false); + FIELD_TYPE_STORE_TERM_VECTORS.setOmitNorms(true); + FIELD_TYPE_STORE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS); + FIELD_TYPE_STORE_TERM_VECTORS.setStoreTermVectors(true); } private float featureValue; @@ -123,7 +129,21 @@ public final class FeatureField extends 
Field { * @param featureValue The value of the feature, must be a positive, finite, normal float. */ public FeatureField(String fieldName, String featureName, float featureValue) { - super(fieldName, featureName, FIELD_TYPE); + this(fieldName, featureName, featureValue, false); + } + + /** + * Create a feature. + * + * @param fieldName The name of the field to store the information into. All features may be + * stored in the same field. + * @param featureName The name of the feature, e.g. 'pagerank'. It will be indexed as a term. + * @param featureValue The value of the feature, must be a positive, finite, normal float. + * @param storeTermVectors Whether term vectors should be stored. + */ + public FeatureField( + String fieldName, String featureName, float featureValue, boolean storeTermVectors) { + super(fieldName, featureName, storeTermVectors ? FIELD_TYPE_STORE_TERM_VECTORS : FIELD_TYPE); setFeatureValue(featureValue); } diff --git a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java index 3bccc17c0a6..33918c4d8dc 100644 --- a/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java +++ b/lucene/core/src/test/org/apache/lucene/document/TestFeatureField.java @@ -16,6 +16,9 @@ */ package org.apache.lucene.document; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; + import java.io.IOException; import java.util.List; import org.apache.lucene.document.Field.Store; @@ -38,6 +41,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.search.QueryUtils; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; public class TestFeatureField extends LuceneTestCase { @@ -87,6 +91,9 @@ public class TestFeatureField extends LuceneTestCase { IndexSearcher searcher = 
LuceneTestCase.newSearcher(reader); LeafReaderContext context = searcher.getIndexReader().leaves().get(0); + var fieldInfo = context.reader().getFieldInfos().fieldInfo("features"); + assertFalse(fieldInfo.hasTermVectors()); + Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f); Weight w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2); Scorer s = w.scorer(context); @@ -445,4 +452,77 @@ public class TestFeatureField extends LuceneTestCase { reader.close(); } } + + public void testStoreTermVectors() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter writer = + new RandomIndexWriter( + random(), + dir, + newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean()))); + Document doc = new Document(); + FeatureField pagerank = new FeatureField("features", "pagerank", 1, true); + FeatureField urlLength = new FeatureField("features", "urlLen", 1, true); + doc.add(pagerank); + doc.add(urlLength); + + pagerank.setFeatureValue(10); + urlLength.setFeatureValue(0.5f); + writer.addDocument(doc); + + writer.addDocument(new Document()); // gap + + pagerank.setFeatureValue(42); + urlLength.setFeatureValue(1.5f); + writer.addDocument(doc); + + doc.clear(); + FeatureField invalid = new FeatureField("features", "pagerank", 1, false); + doc.add(invalid); + var exc = expectThrows(Exception.class, () -> writer.addDocument(doc)); + assertThat(exc.getMessage(), containsString("store term vector")); + + writer.forceMerge(1); + DirectoryReader reader = writer.getReader(); + writer.close(); + + IndexSearcher searcher = LuceneTestCase.newSearcher(reader); + LeafReaderContext context = searcher.getIndexReader().leaves().get(0); + + var fieldInfo = context.reader().getFieldInfos().fieldInfo("features"); + assertTrue(fieldInfo.hasTermVectors()); + + var terms = context.reader().termVectors().get(0, "features"); + var termsEnum = terms.iterator(); + assertThat(termsEnum.next(), equalTo(new BytesRef("pagerank"))); + var postings = 
termsEnum.postings(null); + assertThat(postings.nextDoc(), equalTo(0)); + assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(10f)); + assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS)); + + assertThat(termsEnum.next(), equalTo(new BytesRef("urlLen"))); + postings = termsEnum.postings(postings); + assertThat(postings.nextDoc(), equalTo(0)); + assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(0.5f)); + assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS)); + + terms = context.reader().termVectors().get(1, "features"); + assertNull(terms); + + terms = context.reader().termVectors().get(2, "features"); + termsEnum = terms.iterator(); + assertThat(termsEnum.next(), equalTo(new BytesRef("pagerank"))); + postings = termsEnum.postings(postings); + assertThat(postings.nextDoc(), equalTo(0)); + assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(42f)); + assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS)); + + assertThat(termsEnum.next(), equalTo(new BytesRef("urlLen"))); + postings = termsEnum.postings(null); + assertThat(postings.nextDoc(), equalTo(0)); + assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(1.5f)); + assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS)); + + IOUtils.close(reader, dir); + } }