mirror of https://github.com/apache/lucene.git
LUCENE-8815: Adds a DoubleValues implementation for feature fields (#687)
This change adds a static method FeatureField#newDoubleValues() which can be used to retrieve the values of a feature for documents directly, rather than having to store the values in a numeric field alongside the feature field.
This commit is contained in:
parent
97ca9df7ef
commit
5ef2b3f6b8
|
@ -35,6 +35,11 @@ Other
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
|
* LUCENE-8815: Provide a DoubleValues implementation for retrieving the value of features without
|
||||||
|
requiring a separate numeric field. Note that as feature values are stored with only 8 bits of
|
||||||
|
mantissa the values returned may have a delta from the original values indexed.
|
||||||
|
(Colin Goodheart-Smithe via Adrien Grand)
|
||||||
|
|
||||||
* LUCENE-8803: Provide a FeatureSortField to allow sorting search hits by descending value of a
|
* LUCENE-8803: Provide a FeatureSortField to allow sorting search hits by descending value of a
|
||||||
feature. This is exposed via the factory method FeatureField#newFeatureSort.
|
feature. This is exposed via the factory method FeatureField#newFeatureSort.
|
||||||
(Colin Goodheart-Smithe via Adrien Grand)
|
(Colin Goodheart-Smithe via Adrien Grand)
|
||||||
|
|
|
@ -0,0 +1,132 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.document;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
|
import org.apache.lucene.index.Terms;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
|
import org.apache.lucene.search.DoubleValues;
|
||||||
|
import org.apache.lucene.search.DoubleValuesSource;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link DoubleValuesSource} instance which can be used to read the values of a feature from a
|
||||||
|
* {@link FeatureField} for documents.
|
||||||
|
*/
|
||||||
|
class FeatureDoubleValuesSource extends DoubleValuesSource {
|
||||||
|
|
||||||
|
private final BytesRef featureName;
|
||||||
|
private final String field;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link DoubleValuesSource} instance which can be used to read the values of a feature from the a
|
||||||
|
* {@link FeatureField} for documents.
|
||||||
|
*
|
||||||
|
* @param field field name. Must not be null.
|
||||||
|
* @param featureName feature name. Must not be null.
|
||||||
|
* @throws NullPointerException if {@code field} or {@code featureName} is null.
|
||||||
|
*/
|
||||||
|
public FeatureDoubleValuesSource(String field, String featureName) {
|
||||||
|
this.field = Objects.requireNonNull(field);
|
||||||
|
this.featureName = new BytesRef(Objects.requireNonNull(featureName));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isCacheable(LeafReaderContext ctx) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
|
||||||
|
Terms terms = ctx.reader().terms(field);
|
||||||
|
if (terms == null) {
|
||||||
|
return DoubleValues.EMPTY;
|
||||||
|
} else {
|
||||||
|
TermsEnum termsEnum = terms.iterator();
|
||||||
|
if (termsEnum.seekExact(featureName) == false) {
|
||||||
|
return DoubleValues.EMPTY;
|
||||||
|
} else {
|
||||||
|
PostingsEnum currentReaderPostingsValues = termsEnum.postings(null, PostingsEnum.FREQS);
|
||||||
|
return new FeatureDoubleValues(currentReaderPostingsValues);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean needsScores() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DoubleValuesSource rewrite(IndexSearcher reader) throws IOException {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return Objects.hash(field, featureName);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (obj == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (obj.getClass() != getClass()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
FeatureDoubleValuesSource other = (FeatureDoubleValuesSource) obj;
|
||||||
|
return Objects.equals(field, other.field) &&
|
||||||
|
Objects.equals(featureName, other.featureName);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "FeatureDoubleValuesSource("+field+", "+featureName.utf8ToString()+")";
|
||||||
|
}
|
||||||
|
|
||||||
|
static class FeatureDoubleValues extends DoubleValues {
|
||||||
|
|
||||||
|
private final PostingsEnum currentReaderPostingsValues;
|
||||||
|
|
||||||
|
public FeatureDoubleValues(PostingsEnum currentReaderPostingsValues) throws IOException {
|
||||||
|
this.currentReaderPostingsValues = currentReaderPostingsValues;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double doubleValue() throws IOException {
|
||||||
|
return FeatureField.decodeFeatureValue(currentReaderPostingsValues.freq());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean advanceExact(int doc) throws IOException {
|
||||||
|
if (doc >= currentReaderPostingsValues.docID()
|
||||||
|
&& (currentReaderPostingsValues.docID() == doc || currentReaderPostingsValues.advance(doc) == doc)) {
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -29,6 +29,7 @@ import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.TermStates;
|
import org.apache.lucene.index.TermStates;
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
import org.apache.lucene.search.BoostQuery;
|
import org.apache.lucene.search.BoostQuery;
|
||||||
|
import org.apache.lucene.search.DoubleValuesSource;
|
||||||
import org.apache.lucene.search.Explanation;
|
import org.apache.lucene.search.Explanation;
|
||||||
import org.apache.lucene.search.FieldDoc;
|
import org.apache.lucene.search.FieldDoc;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
|
@ -538,4 +539,17 @@ public final class FeatureField extends Field {
|
||||||
public static SortField newFeatureSort(String field, String featureName) {
|
public static SortField newFeatureSort(String field, String featureName) {
|
||||||
return new FeatureSortField(field, featureName);
|
return new FeatureSortField(field, featureName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a {@link DoubleValuesSource} instance which can be used to read the values of a feature from a
|
||||||
|
* {@link FeatureField} for documents.
|
||||||
|
*
|
||||||
|
* @param field field name. Must not be null.
|
||||||
|
* @param featureName feature name. Must not be null.
|
||||||
|
* @return a {@link DoubleValuesSource} which can be used to access the values of the feature for documents
|
||||||
|
* @throws NullPointerException if {@code field} or {@code featureName} is null.
|
||||||
|
*/
|
||||||
|
public static DoubleValuesSource newDoubleValues(String field, String featureName) {
|
||||||
|
return new FeatureDoubleValuesSource(field, featureName);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,248 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.document;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
|
import org.apache.lucene.search.DoubleValues;
|
||||||
|
import org.apache.lucene.search.DoubleValuesSource;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Test for retrieving values from a feature using a FeatureDoubleValuesSource.
|
||||||
|
*
|
||||||
|
* THE RULES:
|
||||||
|
* 1. keywords like 'abstract' and 'static' should not appear in this file.
|
||||||
|
* 2. each test method should be self-contained and understandable.
|
||||||
|
* 3. no test methods should share code with other test methods.
|
||||||
|
* 4. no testing of things unrelated to feature double values.
|
||||||
|
* 5. no tracers.
|
||||||
|
* 6. keyword 'class' should appear only once in this file, here ----
|
||||||
|
* |
|
||||||
|
* -----------------------------------------------------------
|
||||||
|
* |
|
||||||
|
* \./
|
||||||
|
*/
|
||||||
|
public class TestFeatureDoubleValues extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testFeature() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new FeatureField("field", "name", 30F));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new FeatureField("field", "name", 1F));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new FeatureField("field", "name", 4F));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
IndexReader ir = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
assertEquals(1, ir.leaves().size());
|
||||||
|
LeafReaderContext context = ir.leaves().get(0);
|
||||||
|
DoubleValuesSource valuesSource = FeatureField.newDoubleValues("field", "name");
|
||||||
|
DoubleValues values = valuesSource.getValues(context, null);
|
||||||
|
|
||||||
|
assertTrue(values.advanceExact(0));
|
||||||
|
assertEquals(30, values.doubleValue(), 0f);
|
||||||
|
assertTrue(values.advanceExact(1));
|
||||||
|
assertEquals(1, values.doubleValue(), 0f);
|
||||||
|
assertTrue(values.advanceExact(2));
|
||||||
|
assertEquals(4, values.doubleValue(), 0f);
|
||||||
|
assertFalse(values.advanceExact(3));
|
||||||
|
|
||||||
|
ir.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFeatureMissing() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
writer.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new FeatureField("field", "name", 1F));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new FeatureField("field", "name", 4F));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
IndexReader ir = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
assertEquals(1, ir.leaves().size());
|
||||||
|
LeafReaderContext context = ir.leaves().get(0);
|
||||||
|
DoubleValuesSource valuesSource = FeatureField.newDoubleValues("field", "name");
|
||||||
|
DoubleValues values = valuesSource.getValues(context, null);
|
||||||
|
|
||||||
|
assertFalse(values.advanceExact(0));
|
||||||
|
assertTrue(values.advanceExact(1));
|
||||||
|
assertEquals(1, values.doubleValue(), 0f);
|
||||||
|
assertTrue(values.advanceExact(2));
|
||||||
|
assertEquals(4, values.doubleValue(), 0f);
|
||||||
|
assertFalse(values.advanceExact(3));
|
||||||
|
|
||||||
|
ir.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFeatureMissingFieldInSegment() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
writer.addDocument(doc);
|
||||||
|
writer.commit();
|
||||||
|
IndexReader ir = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
assertEquals(1, ir.leaves().size());
|
||||||
|
LeafReaderContext context = ir.leaves().get(0);
|
||||||
|
DoubleValuesSource valuesSource = FeatureField.newDoubleValues("field", "name");
|
||||||
|
DoubleValues values = valuesSource.getValues(context, null);
|
||||||
|
|
||||||
|
assertFalse(values.advanceExact(0));
|
||||||
|
assertFalse(values.advanceExact(1));
|
||||||
|
|
||||||
|
ir.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFeatureMissingFeatureNameInSegment() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new FeatureField("field", "different_name", 0.5F));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
writer.commit();
|
||||||
|
IndexReader ir = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
assertEquals(1, ir.leaves().size());
|
||||||
|
LeafReaderContext context = ir.leaves().get(0);
|
||||||
|
DoubleValuesSource valuesSource = FeatureField.newDoubleValues("field", "name");
|
||||||
|
DoubleValues values = valuesSource.getValues(context, null);
|
||||||
|
|
||||||
|
assertFalse(values.advanceExact(0));
|
||||||
|
assertFalse(values.advanceExact(1));
|
||||||
|
|
||||||
|
ir.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFeatureMultipleMissing() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||||
|
Document doc = new Document();
|
||||||
|
writer.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
writer.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
writer.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
writer.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
writer.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new FeatureField("field", "name", 1F));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new FeatureField("field", "name", 4F));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
IndexReader ir = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
assertEquals(1, ir.leaves().size());
|
||||||
|
LeafReaderContext context = ir.leaves().get(0);
|
||||||
|
DoubleValuesSource valuesSource = FeatureField.newDoubleValues("field", "name");
|
||||||
|
DoubleValues values = valuesSource.getValues(context, null);
|
||||||
|
|
||||||
|
assertFalse(values.advanceExact(0));
|
||||||
|
assertFalse(values.advanceExact(1));
|
||||||
|
assertFalse(values.advanceExact(2));
|
||||||
|
assertFalse(values.advanceExact(3));
|
||||||
|
assertFalse(values.advanceExact(4));
|
||||||
|
assertTrue(values.advanceExact(5));
|
||||||
|
assertEquals(1, values.doubleValue(), 0f);
|
||||||
|
assertTrue(values.advanceExact(6));
|
||||||
|
assertEquals(4, values.doubleValue(), 0f);
|
||||||
|
assertFalse(values.advanceExact(7));
|
||||||
|
|
||||||
|
ir.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testHashCodeAndEquals() {
|
||||||
|
FeatureDoubleValuesSource valuesSource = new FeatureDoubleValuesSource("test_field", "test_feature");
|
||||||
|
FeatureDoubleValuesSource equal = new FeatureDoubleValuesSource("test_field", "test_feature");
|
||||||
|
|
||||||
|
FeatureDoubleValuesSource differentField = new FeatureDoubleValuesSource("other field", "test_feature");
|
||||||
|
FeatureDoubleValuesSource differentFeature = new FeatureDoubleValuesSource("test_field", "other_feature");
|
||||||
|
DoubleValuesSource otherImpl = new DoubleValuesSource() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isCacheable(LeafReaderContext ctx) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DoubleValuesSource rewrite(IndexSearcher reader) throws IOException {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean needsScores() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
assertTrue(valuesSource.equals(equal));
|
||||||
|
assertEquals(valuesSource.hashCode(), equal.hashCode());
|
||||||
|
assertFalse(valuesSource.equals(null));
|
||||||
|
assertFalse(valuesSource.equals(otherImpl));
|
||||||
|
assertNotEquals(valuesSource.hashCode(), otherImpl.hashCode());
|
||||||
|
assertFalse(valuesSource.equals(differentField));
|
||||||
|
assertNotEquals(valuesSource.hashCode(), differentField.hashCode());
|
||||||
|
assertFalse(valuesSource.equals(differentFeature));
|
||||||
|
assertNotEquals(valuesSource.hashCode(), differentFeature.hashCode());
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue