mirror of https://github.com/apache/lucene.git
Add support for storing term vectors in FeatureField (#14034)
This update introduces an option to store term vectors generated by the FeatureField. With this option, term vectors can be used to access all features for each document.
This commit is contained in:
parent
c9c631f9d2
commit
356a534c0b
|
@ -20,7 +20,7 @@ API Changes
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
---------------------
|
---------------------
|
||||||
(No changes)
|
* GITHUB#14034: Add support for storing term vectors in FeatureField. (Jim Ferenczi)
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
---------------------
|
---------------------
|
||||||
|
|
|
@ -105,11 +105,17 @@ import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||||
public final class FeatureField extends Field {
|
public final class FeatureField extends Field {
|
||||||
|
|
||||||
private static final FieldType FIELD_TYPE = new FieldType();
|
private static final FieldType FIELD_TYPE = new FieldType();
|
||||||
|
private static final FieldType FIELD_TYPE_STORE_TERM_VECTORS = new FieldType();
|
||||||
|
|
||||||
static {
|
static {
|
||||||
FIELD_TYPE.setTokenized(false);
|
FIELD_TYPE.setTokenized(false);
|
||||||
FIELD_TYPE.setOmitNorms(true);
|
FIELD_TYPE.setOmitNorms(true);
|
||||||
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
|
||||||
|
FIELD_TYPE_STORE_TERM_VECTORS.setTokenized(false);
|
||||||
|
FIELD_TYPE_STORE_TERM_VECTORS.setOmitNorms(true);
|
||||||
|
FIELD_TYPE_STORE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
|
||||||
|
FIELD_TYPE_STORE_TERM_VECTORS.setStoreTermVectors(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
private float featureValue;
|
private float featureValue;
|
||||||
|
@ -123,7 +129,21 @@ public final class FeatureField extends Field {
|
||||||
* @param featureValue The value of the feature, must be a positive, finite, normal float.
|
* @param featureValue The value of the feature, must be a positive, finite, normal float.
|
||||||
*/
|
*/
|
||||||
public FeatureField(String fieldName, String featureName, float featureValue) {
|
public FeatureField(String fieldName, String featureName, float featureValue) {
|
||||||
super(fieldName, featureName, FIELD_TYPE);
|
this(fieldName, featureName, featureValue, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a feature.
|
||||||
|
*
|
||||||
|
* @param fieldName The name of the field to store the information into. All features may be
|
||||||
|
* stored in the same field.
|
||||||
|
* @param featureName The name of the feature, e.g. 'pagerank'. It will be indexed as a term.
|
||||||
|
* @param featureValue The value of the feature, must be a positive, finite, normal float.
|
||||||
|
* @param storeTermVectors Whether term vectors should be stored.
|
||||||
|
*/
|
||||||
|
public FeatureField(
|
||||||
|
String fieldName, String featureName, float featureValue, boolean storeTermVectors) {
|
||||||
|
super(fieldName, featureName, storeTermVectors ? FIELD_TYPE_STORE_TERM_VECTORS : FIELD_TYPE);
|
||||||
setFeatureValue(featureValue);
|
setFeatureValue(featureValue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,9 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.document;
|
package org.apache.lucene.document;
|
||||||
|
|
||||||
|
import static org.hamcrest.Matchers.containsString;
|
||||||
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.apache.lucene.document.Field.Store;
|
import org.apache.lucene.document.Field.Store;
|
||||||
|
@ -38,6 +41,7 @@ import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.tests.index.RandomIndexWriter;
|
import org.apache.lucene.tests.index.RandomIndexWriter;
|
||||||
import org.apache.lucene.tests.search.QueryUtils;
|
import org.apache.lucene.tests.search.QueryUtils;
|
||||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
public class TestFeatureField extends LuceneTestCase {
|
public class TestFeatureField extends LuceneTestCase {
|
||||||
|
@ -87,6 +91,9 @@ public class TestFeatureField extends LuceneTestCase {
|
||||||
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
||||||
LeafReaderContext context = searcher.getIndexReader().leaves().get(0);
|
LeafReaderContext context = searcher.getIndexReader().leaves().get(0);
|
||||||
|
|
||||||
|
var fieldInfo = context.reader().getFieldInfos().fieldInfo("features");
|
||||||
|
assertFalse(fieldInfo.hasTermVectors());
|
||||||
|
|
||||||
Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
|
Query q = FeatureField.newLogQuery("features", "pagerank", 3f, 4.5f);
|
||||||
Weight w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
|
Weight w = q.createWeight(searcher, ScoreMode.TOP_SCORES, 2);
|
||||||
Scorer s = w.scorer(context);
|
Scorer s = w.scorer(context);
|
||||||
|
@ -445,4 +452,77 @@ public class TestFeatureField extends LuceneTestCase {
|
||||||
reader.close();
|
reader.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testStoreTermVectors() throws Exception {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer =
|
||||||
|
new RandomIndexWriter(
|
||||||
|
random(),
|
||||||
|
dir,
|
||||||
|
newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean())));
|
||||||
|
Document doc = new Document();
|
||||||
|
FeatureField pagerank = new FeatureField("features", "pagerank", 1, true);
|
||||||
|
FeatureField urlLength = new FeatureField("features", "urlLen", 1, true);
|
||||||
|
doc.add(pagerank);
|
||||||
|
doc.add(urlLength);
|
||||||
|
|
||||||
|
pagerank.setFeatureValue(10);
|
||||||
|
urlLength.setFeatureValue(0.5f);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
|
||||||
|
writer.addDocument(new Document()); // gap
|
||||||
|
|
||||||
|
pagerank.setFeatureValue(42);
|
||||||
|
urlLength.setFeatureValue(1.5f);
|
||||||
|
writer.addDocument(doc);
|
||||||
|
|
||||||
|
doc.clear();
|
||||||
|
FeatureField invalid = new FeatureField("features", "pagerank", 1, false);
|
||||||
|
doc.add(invalid);
|
||||||
|
var exc = expectThrows(Exception.class, () -> writer.addDocument(doc));
|
||||||
|
assertThat(exc.getMessage(), containsString("store term vector"));
|
||||||
|
|
||||||
|
writer.forceMerge(1);
|
||||||
|
DirectoryReader reader = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
IndexSearcher searcher = LuceneTestCase.newSearcher(reader);
|
||||||
|
LeafReaderContext context = searcher.getIndexReader().leaves().get(0);
|
||||||
|
|
||||||
|
var fieldInfo = context.reader().getFieldInfos().fieldInfo("features");
|
||||||
|
assertTrue(fieldInfo.hasTermVectors());
|
||||||
|
|
||||||
|
var terms = context.reader().termVectors().get(0, "features");
|
||||||
|
var termsEnum = terms.iterator();
|
||||||
|
assertThat(termsEnum.next(), equalTo(new BytesRef("pagerank")));
|
||||||
|
var postings = termsEnum.postings(null);
|
||||||
|
assertThat(postings.nextDoc(), equalTo(0));
|
||||||
|
assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(10f));
|
||||||
|
assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));
|
||||||
|
|
||||||
|
assertThat(termsEnum.next(), equalTo(new BytesRef("urlLen")));
|
||||||
|
postings = termsEnum.postings(postings);
|
||||||
|
assertThat(postings.nextDoc(), equalTo(0));
|
||||||
|
assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(0.5f));
|
||||||
|
assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));
|
||||||
|
|
||||||
|
terms = context.reader().termVectors().get(1, "features");
|
||||||
|
assertNull(terms);
|
||||||
|
|
||||||
|
terms = context.reader().termVectors().get(2, "features");
|
||||||
|
termsEnum = terms.iterator();
|
||||||
|
assertThat(termsEnum.next(), equalTo(new BytesRef("pagerank")));
|
||||||
|
postings = termsEnum.postings(postings);
|
||||||
|
assertThat(postings.nextDoc(), equalTo(0));
|
||||||
|
assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(42f));
|
||||||
|
assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));
|
||||||
|
|
||||||
|
assertThat(termsEnum.next(), equalTo(new BytesRef("urlLen")));
|
||||||
|
postings = termsEnum.postings(null);
|
||||||
|
assertThat(postings.nextDoc(), equalTo(0));
|
||||||
|
assertThat(FeatureField.decodeFeatureValue(postings.freq()), equalTo(1.5f));
|
||||||
|
assertThat(postings.nextDoc(), equalTo(DocIdSetIterator.NO_MORE_DOCS));
|
||||||
|
|
||||||
|
IOUtils.close(reader, dir);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue