Add support for storing term vectors in FeatureField (#14034)

This update introduces an option to store term vectors generated by the FeatureField.
With this option, term vectors can be used to access all features for each document.
This commit is contained in:
Jim Ferenczi 2024-12-04 12:31:00 +00:00 committed by GitHub
parent 552b3f52d7
commit 8fdd48b3b3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 102 additions and 2 deletions

View File

@ -47,7 +47,7 @@ API Changes
New Features
---------------------
* GITHUB#14034: Add support for storing term vectors in FeatureField. (Jim Ferenczi)
Improvements
---------------------

View File

@ -105,11 +105,17 @@ import org.apache.lucene.search.similarities.Similarity.SimScorer;
public final class FeatureField extends Field {
private static final FieldType FIELD_TYPE = new FieldType();
private static final FieldType FIELD_TYPE_STORE_TERM_VECTORS = new FieldType();
static {
FIELD_TYPE.setTokenized(false);
FIELD_TYPE.setOmitNorms(true);
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FIELD_TYPE_STORE_TERM_VECTORS.setTokenized(false);
FIELD_TYPE_STORE_TERM_VECTORS.setOmitNorms(true);
FIELD_TYPE_STORE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FIELD_TYPE_STORE_TERM_VECTORS.setStoreTermVectors(true);
}
private float featureValue;
@ -123,7 +129,21 @@ public final class FeatureField extends Field {
* @param featureValue The value of the feature, must be a positive, finite, normal float.
*/
public FeatureField(String fieldName, String featureName, float featureValue) {
  // Delegate to the four-argument constructor, leaving term vectors disabled. A constructor may
  // contain only one explicit constructor invocation, so there is no separate super(...) call.
  this(fieldName, featureName, featureValue, false);
}
/**
 * Create a feature, choosing whether term vectors are stored for it.
 *
 * @param fieldName The name of the field to store the information into. All features may be
 *     stored in the same field.
 * @param featureName The name of the feature, eg. 'pagerank'. It will be indexed as a term.
 * @param featureValue The value of the feature, must be a positive, finite, normal float.
 * @param storeTermVectors Whether term vectors should be stored.
 */
public FeatureField(
    String fieldName, String featureName, float featureValue, boolean storeTermVectors) {
  super(fieldName, featureName, selectFieldType(storeTermVectors));
  setFeatureValue(featureValue);
}

/** Returns the shared field type that matches the requested term-vector setting. */
private static FieldType selectFieldType(boolean storeTermVectors) {
  if (storeTermVectors) {
    return FIELD_TYPE_STORE_TERM_VECTORS;
  }
  return FIELD_TYPE;
}

View File

@ -16,6 +16,9 @@
*/
package org.apache.lucene.document;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.document.Field.Store;
@ -38,6 +41,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.search.QueryUtils;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
public class TestFeatureField extends LuceneTestCase {
@ -87,6 +91,9 @@ public class TestFeatureField extends LuceneTestCase {
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
LeafReaderContext context = searcher.getIndexReader().leaves().get(0);
var fieldInfo = context.reader().getFieldInfos().fieldInfo("features");
assertFalse(fieldInfo.hasTermVectors());
Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
Weight w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
Scorer s = w.scorer(context);
@ -445,4 +452,77 @@ public class TestFeatureField extends LuceneTestCase {
reader.close();
}
}
/**
 * Checks that feature fields created with {@code storeTermVectors=true} expose their features
 * and values through term vectors, and that adding the same field without term vectors is
 * rejected.
 */
public void testStoreTermVectors() throws Exception {
  Directory dir = newDirectory();
  DirectoryReader reader = null;
  try {
    // try-with-resources: the writer is released even if an assertion in this block fails.
    try (RandomIndexWriter writer =
        new RandomIndexWriter(
            random(),
            dir,
            newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean())))) {
      Document doc = new Document();
      FeatureField pagerank = new FeatureField("features", "pagerank", 1, true);
      FeatureField urlLength = new FeatureField("features", "urlLen", 1, true);
      doc.add(pagerank);
      doc.add(urlLength);

      // First document: pagerank=10, urlLen=0.5.
      pagerank.setFeatureValue(10);
      urlLength.setFeatureValue(0.5f);
      writer.addDocument(doc);

      writer.addDocument(new Document()); // gap

      // Third document reuses the same field instances with new values.
      pagerank.setFeatureValue(42);
      urlLength.setFeatureValue(1.5f);
      writer.addDocument(doc);

      // The same field name without term vectors must be rejected.
      doc.clear();
      FeatureField invalid = new FeatureField("features", "pagerank", 1, false);
      doc.add(invalid);
      var exc = expectThrows(Exception.class, () -> writer.addDocument(doc));
      assertThat(exc.getMessage(), containsString("store term vector"));

      writer.forceMerge(1);
      reader = writer.getReader();
    }

    IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
    LeafReaderContext context = searcher.getIndexReader().leaves().get(0);

    var fieldInfo = context.reader().getFieldInfos().fieldInfo("features");
    assertTrue(fieldInfo.hasTermVectors());

    // Document 0: both features, with the first set of values.
    var terms = context.reader().termVectors().get(0, "features");
    var termsEnum = terms.iterator();
    assertThat(termsEnum.next(), equalTo(new BytesRef("pagerank")));
    var postings = termsEnum.postings(null);
    assertThat(postings.nextDoc(), equalTo(0));
    assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(10f));
    assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));

    assertThat(termsEnum.next(), equalTo(new BytesRef("urlLen")));
    postings = termsEnum.postings(postings);
    assertThat(postings.nextDoc(), equalTo(0));
    assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(0.5f));
    assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));

    // Document 1 is the empty gap document and has no term vector for the field.
    terms = context.reader().termVectors().get(1, "features");
    assertNull(terms);

    // Document 2: both features, with the second set of values.
    terms = context.reader().termVectors().get(2, "features");
    termsEnum = terms.iterator();
    assertThat(termsEnum.next(), equalTo(new BytesRef("pagerank")));
    postings = termsEnum.postings(postings);
    assertThat(postings.nextDoc(), equalTo(0));
    assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(42f));
    assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));

    assertThat(termsEnum.next(), equalTo(new BytesRef("urlLen")));
    postings = termsEnum.postings(null);
    assertThat(postings.nextDoc(), equalTo(0));
    assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(1.5f));
    assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));
  } finally {
    // Runs on failure too, so a failing assertion does not leak the reader or the directory.
    // reader is still null if the indexing block above failed.
    IOUtils.close(reader, dir);
  }
}
}