LUCENE-8803: Provide a FieldComparator to allow sorting by a feature from a FeatureField (#680)

This change adds a SortField which allows a convenient way to sort search hits using a feature from a FeatureField.
This commit is contained in:
Colin Goodheart-Smithe 2019-05-24 07:45:57 +01:00 committed by Adrien Grand
parent 493364d4b1
commit 39c8cca177
4 changed files with 404 additions and 1 deletions

View File

@ -5,6 +5,12 @@ http://s.apache.org/luceneversions
======================= Lucene 8.2.0 =======================
New Features
* LUCENE-8803: Provide a FeatureSortField to allow sorting search hits by descending value of a
feature. This is exposed via the factory method FeatureField#newFeatureSort.
(Colin Goodheart-Smithe via Adrien Grand)
Bug Fixes
* LUCENE-8785: Ensure new threadstates are locked before retrieving the number of active threadstates.

View File

@ -30,7 +30,9 @@ import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
@ -196,7 +198,7 @@ public final class FeatureField extends Field {
private static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;
private static float decodeFeatureValue(float freq) {
static float decodeFeatureValue(float freq) {
if (freq > MAX_FREQ) {
// This is never used in practice but callers of the SimScorer API might
// occasionally call it on eg. Float.MAX_VALUE to compute the max score
@ -518,4 +520,22 @@ public final class FeatureField extends Field {
float avgFreq = (float) ((double) states.totalTermFreq() / states.docFreq());
return decodeFeatureValue(avgFreq);
}
/**
* Creates a SortField for sorting by the value of a feature.
* <p>
* This sort orders documents by descending value of a feature. The value returned in {@link FieldDoc} for
* the hits contains a Float instance with the feature value.
* <p>
* If a document is missing the field, or has no value for the requested feature, then it is treated
* as having a value of <code>0.0f</code>.
*
* @param field field name. Must not be null.
* @param featureName feature name. Must not be null.
* @return SortField ordering documents by the value of the feature
* @throws NullPointerException if {@code field} or {@code featureName} is null.
*/
public static SortField newFeatureSort(String field, String featureName) {
// The null checks documented above are performed by the FeatureSortField constructor.
return new FeatureSortField(field, featureName);
}
}

View File

@ -0,0 +1,164 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.SimpleFieldComparator;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
/**
 * Sorts using the value of a specified feature name from a {@link FeatureField}.
 * <p>
 * Hits are ordered by descending feature value. A document that has no value for the
 * feature is sorted as if its value were {@code 0.0f}.
 */
final class FeatureSortField extends SortField {

  /** Name of the feature whose value drives the sort. */
  private final String featureName;

  /**
   * Creates a {@link FeatureSortField} that can be used to sort hits by
   * the value of a particular feature in a {@link FeatureField}.
   *
   * @param field The name of the field that holds the features
   * @param featureName The name of the feature to use for the sort value
   * @throws NullPointerException if {@code field} or {@code featureName} is null
   */
  public FeatureSortField(String field, String featureName) {
    super(Objects.requireNonNull(field), SortField.Type.CUSTOM);
    this.featureName = Objects.requireNonNull(featureName);
  }

  /**
   * Returns a new {@link FeatureComparator} with room for {@code numHits} values.
   *
   * @param numHits number of slots the comparator must be able to hold
   * @param sortPos position of this sort field within the sort; not used here
   */
  @Override
  public FieldComparator<?> getComparator(int numHits, int sortPos) {
    return new FeatureComparator(numHits, getField(), featureName);
  }

  /**
   * Always rejects the call: this sort field does not accept a missing value.
   *
   * @throws IllegalArgumentException always
   */
  @Override
  public void setMissingValue(Object missingValue) {
    throw new IllegalArgumentException("Missing value not supported for FeatureSortField");
  }

  /** Combines the hash of the parent {@link SortField} with the hash of the feature name. */
  @Override
  public int hashCode() {
    final int prime = 31;
    int result = super.hashCode();
    result = prime * result + featureName.hashCode();
    return result;
  }

  /**
   * Two instances are equal when the parent {@link SortField} considers them equal, they have
   * exactly the same runtime type, and they name the same feature.
   */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj)) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }
    FeatureSortField other = (FeatureSortField) obj;
    return Objects.equals(featureName, other.featureName);
  }

  /** Renders as {@code <feature:"FIELD" featureName=NAME>}. */
  @Override
  public String toString() {
    StringBuilder builder = new StringBuilder();
    builder.append("<feature:");
    builder.append('"');
    builder.append(getField());
    builder.append('"');
    builder.append(" featureName=");
    builder.append(featureName);
    builder.append('>');
    return builder.toString();
  }

  /**
   * Parses a feature field's values as float and sorts by descending value.
   * <p>
   * Declared {@code static} because it only uses the field and feature name handed to its
   * constructor; it has no need for a reference to the enclosing {@link FeatureSortField}.
   */
  static class FeatureComparator extends SimpleFieldComparator<Float> {

    /** Name of the field whose terms are the feature names. */
    private final String field;
    /** The feature name as the term to look up in {@link #field}. */
    private final BytesRef featureName;
    /** Feature value stored for each slot. */
    private final float[] values;
    /** Value of the slot last passed to {@link #setBottom(int)}. */
    private float bottom;
    /** Value last passed to {@link #setTopValue(Float)}. */
    private float topValue;
    /** Postings of the feature term in the current segment, or null if the segment has none. */
    private PostingsEnum currentReaderPostingsValues;

    /**
     * Creates a new comparator that sorts by the value of feature {@code featureName}
     * in field {@code field}, with room for {@code numHits} values.
     */
    public FeatureComparator(int numHits, String field, String featureName) {
      this.values = new float[numHits];
      this.field = field;
      this.featureName = new BytesRef(featureName);
    }

    /**
     * Positions this comparator on a new segment by looking up the postings of the feature term.
     * The postings are left as null when the segment lacks the field or the feature term.
     */
    @Override
    protected void doSetNextReader(LeafReaderContext context) throws IOException {
      Terms terms = context.reader().terms(field);
      if (terms == null) {
        // The field does not exist in this segment.
        currentReaderPostingsValues = null;
      } else {
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(featureName) == false) {
          // The field exists but no document in this segment has the feature.
          currentReaderPostingsValues = null;
        } else {
          // Hand the previous enum back for possible reuse; only frequencies are needed.
          currentReaderPostingsValues = termsEnum.postings(currentReaderPostingsValues, PostingsEnum.FREQS);
        }
      }
    }

    /**
     * Returns the decoded feature value of {@code doc} in the current segment.
     * <p>
     * Yields {@code 0.0f} when the segment has no postings for the feature, when the postings
     * do not contain {@code doc}, or when {@code doc} lies before the position the postings
     * have already reached.
     */
    private float getValueForDoc(int doc) throws IOException {
      if (currentReaderPostingsValues != null && doc >= currentReaderPostingsValues.docID()
          && (currentReaderPostingsValues.docID() == doc || currentReaderPostingsValues.advance(doc) == doc)) {
        // The feature value is encoded in the term frequency.
        return FeatureField.decodeFeatureValue(currentReaderPostingsValues.freq());
      } else {
        return 0.0f;
      }
    }

    /** Compares two slots; operands are swapped so that larger values sort first. */
    @Override
    public int compare(int slot1, int slot2) {
      return Float.compare(values[slot2], values[slot1]);
    }

    /** Compares the bottom value with {@code doc}, using the same descending order as {@link #compare}. */
    @Override
    public int compareBottom(int doc) throws IOException {
      return Float.compare(getValueForDoc(doc), bottom);
    }

    /** Stores the feature value of {@code doc} in {@code slot}. */
    @Override
    public void copy(int slot, int doc) throws IOException {
      values[slot] = getValueForDoc(doc);
    }

    /** Remembers the value currently held in slot {@code bottom} as the bottom value. */
    @Override
    public void setBottom(final int bottom) {
      this.bottom = values[bottom];
    }

    /** Remembers {@code value} as the top value used by {@link #compareTop(int)}. */
    @Override
    public void setTopValue(Float value) {
      topValue = value;
    }

    /** Returns the value stored in {@code slot}, boxed as a {@link Float}. */
    @Override
    public Float value(int slot) {
      return Float.valueOf(values[slot]);
    }

    /** Compares the top value with {@code doc}, using the same descending order as {@link #compare}. */
    @Override
    public int compareTop(int doc) throws IOException {
      return Float.compare(getValueForDoc(doc), topValue);
    }
  }
}

View File

@ -0,0 +1,213 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.document;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
/*
* Test for sorting using a feature from a FeatureField.
*
* THE RULES:
* 1. keywords like 'abstract' and 'static' should not appear in this file.
* 2. each test method should be self-contained and understandable.
* 3. no test methods should share code with other test methods.
* 4. no testing of things unrelated to sorting.
* 5. no tracers.
* 6. keyword 'class' should appear only once in this file, here ----
* |
* -----------------------------------------------------------
* |
* \./
*/
public class TestFeatureSort extends LuceneTestCase {

  /** Three documents that all carry the feature are returned from highest to lowest value. */
  public void testFeature() throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

    Document first = new Document();
    first.add(new FeatureField("field", "name", 30.1F));
    first.add(newStringField("value", "30.1", Field.Store.YES));
    indexWriter.addDocument(first);

    Document second = new Document();
    second.add(new FeatureField("field", "name", 1.3F));
    second.add(newStringField("value", "1.3", Field.Store.YES));
    indexWriter.addDocument(second);

    Document third = new Document();
    third.add(new FeatureField("field", "name", 4.2F));
    third.add(newStringField("value", "4.2", Field.Store.YES));
    indexWriter.addDocument(third);

    IndexReader reader = indexWriter.getReader();
    indexWriter.close();

    IndexSearcher indexSearcher = newSearcher(reader);
    Sort byFeature = new Sort(FeatureField.newFeatureSort("field", "name"));
    TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), 10, byFeature);
    assertEquals(3, topDocs.totalHits.value);

    // highest feature value first
    assertEquals("30.1", indexSearcher.doc(topDocs.scoreDocs[0].doc).get("value"));
    assertEquals("4.2", indexSearcher.doc(topDocs.scoreDocs[1].doc).get("value"));
    assertEquals("1.3", indexSearcher.doc(topDocs.scoreDocs[2].doc).get("value"));

    reader.close();
    directory.close();
  }

  /** A document without any feature field sorts after the documents that have the feature. */
  public void testFeatureMissing() throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

    Document empty = new Document();
    indexWriter.addDocument(empty);

    Document low = new Document();
    low.add(new FeatureField("field", "name", 1.3F));
    low.add(newStringField("value", "1.3", Field.Store.YES));
    indexWriter.addDocument(low);

    Document high = new Document();
    high.add(new FeatureField("field", "name", 4.2F));
    high.add(newStringField("value", "4.2", Field.Store.YES));
    indexWriter.addDocument(high);

    IndexReader reader = indexWriter.getReader();
    indexWriter.close();

    IndexSearcher indexSearcher = newSearcher(reader);
    Sort byFeature = new Sort(FeatureField.newFeatureSort("field", "name"));
    TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), 10, byFeature);
    assertEquals(3, topDocs.totalHits.value);

    // the document without the feature counts as 0 and therefore comes last
    assertEquals("4.2", indexSearcher.doc(topDocs.scoreDocs[0].doc).get("value"));
    assertEquals("1.3", indexSearcher.doc(topDocs.scoreDocs[1].doc).get("value"));
    assertNull(indexSearcher.doc(topDocs.scoreDocs[2].doc).get("value"));

    reader.close();
    directory.close();
  }

  /**
   * Same as the missing-feature case, but the empty document is committed on its own
   * before the documents carrying the feature are added.
   */
  public void testFeatureMissingFieldInSegment() throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

    Document empty = new Document();
    indexWriter.addDocument(empty);
    indexWriter.commit();

    Document low = new Document();
    low.add(new FeatureField("field", "name", 1.3F));
    low.add(newStringField("value", "1.3", Field.Store.YES));
    indexWriter.addDocument(low);

    Document high = new Document();
    high.add(new FeatureField("field", "name", 4.2F));
    high.add(newStringField("value", "4.2", Field.Store.YES));
    indexWriter.addDocument(high);

    IndexReader reader = indexWriter.getReader();
    indexWriter.close();

    IndexSearcher indexSearcher = newSearcher(reader);
    Sort byFeature = new Sort(FeatureField.newFeatureSort("field", "name"));
    TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), 10, byFeature);
    assertEquals(3, topDocs.totalHits.value);

    // the document without the feature counts as 0 and therefore comes last
    assertEquals("4.2", indexSearcher.doc(topDocs.scoreDocs[0].doc).get("value"));
    assertEquals("1.3", indexSearcher.doc(topDocs.scoreDocs[1].doc).get("value"));
    assertNull(indexSearcher.doc(topDocs.scoreDocs[2].doc).get("value"));

    reader.close();
    directory.close();
  }

  /**
   * The document committed first has the feature field but only under a different feature
   * name, so it has no value for the feature being sorted on.
   */
  public void testFeatureMissingFeatureNameInSegment() throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

    Document otherFeature = new Document();
    otherFeature.add(new FeatureField("field", "different_name", 0.5F));
    indexWriter.addDocument(otherFeature);
    indexWriter.commit();

    Document low = new Document();
    low.add(new FeatureField("field", "name", 1.3F));
    low.add(newStringField("value", "1.3", Field.Store.YES));
    indexWriter.addDocument(low);

    Document high = new Document();
    high.add(new FeatureField("field", "name", 4.2F));
    high.add(newStringField("value", "4.2", Field.Store.YES));
    indexWriter.addDocument(high);

    IndexReader reader = indexWriter.getReader();
    indexWriter.close();

    IndexSearcher indexSearcher = newSearcher(reader);
    Sort byFeature = new Sort(FeatureField.newFeatureSort("field", "name"));
    TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), 10, byFeature);
    assertEquals(3, topDocs.totalHits.value);

    // the document without the requested feature counts as 0 and therefore comes last
    assertEquals("4.2", indexSearcher.doc(topDocs.scoreDocs[0].doc).get("value"));
    assertEquals("1.3", indexSearcher.doc(topDocs.scoreDocs[1].doc).get("value"));
    assertNull(indexSearcher.doc(topDocs.scoreDocs[2].doc).get("value"));

    reader.close();
    directory.close();
  }

  /** Five documents without the feature all sort after the two documents that have it. */
  public void testFeatureMultipleMissing() throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

    Document emptyOne = new Document();
    indexWriter.addDocument(emptyOne);
    Document emptyTwo = new Document();
    indexWriter.addDocument(emptyTwo);
    Document emptyThree = new Document();
    indexWriter.addDocument(emptyThree);
    Document emptyFour = new Document();
    indexWriter.addDocument(emptyFour);
    Document emptyFive = new Document();
    indexWriter.addDocument(emptyFive);

    Document low = new Document();
    low.add(new FeatureField("field", "name", 1.3F));
    low.add(newStringField("value", "1.3", Field.Store.YES));
    indexWriter.addDocument(low);

    Document high = new Document();
    high.add(new FeatureField("field", "name", 4.2F));
    high.add(newStringField("value", "4.2", Field.Store.YES));
    indexWriter.addDocument(high);

    IndexReader reader = indexWriter.getReader();
    indexWriter.close();

    IndexSearcher indexSearcher = newSearcher(reader);
    Sort byFeature = new Sort(FeatureField.newFeatureSort("field", "name"));
    TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), 10, byFeature);
    assertEquals(7, topDocs.totalHits.value);

    // documents without the feature count as 0 and therefore fill the tail of the result
    assertEquals("4.2", indexSearcher.doc(topDocs.scoreDocs[0].doc).get("value"));
    assertEquals("1.3", indexSearcher.doc(topDocs.scoreDocs[1].doc).get("value"));
    assertNull(indexSearcher.doc(topDocs.scoreDocs[2].doc).get("value"));
    assertNull(indexSearcher.doc(topDocs.scoreDocs[3].doc).get("value"));
    assertNull(indexSearcher.doc(topDocs.scoreDocs[4].doc).get("value"));
    assertNull(indexSearcher.doc(topDocs.scoreDocs[5].doc).get("value"));
    assertNull(indexSearcher.doc(topDocs.scoreDocs[6].doc).get("value"));

    reader.close();
    directory.close();
  }
}