mirror of https://github.com/apache/lucene.git
LUCENE-8803: Provide a FieldComparator to allow sorting by a feature from a FeatureField (#680)
This change adds a SortField which allows a convenient way to sort search hits using a feature from a FeatureField.
This commit is contained in:
parent
493364d4b1
commit
39c8cca177
|
@ -5,6 +5,12 @@ http://s.apache.org/luceneversions
|
|||
|
||||
======================= Lucene 8.2.0 =======================
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-8803: Provide a FeatureSortField to allow sorting search hits by descending value of a
|
||||
feature. This is exposed via the factory method FeatureField#newFeatureSort.
|
||||
(Colin Goodheart-Smithe via Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-8785: Ensure new threadstates are locked before retrieving the number of active threadstates.
|
||||
|
|
|
@ -30,7 +30,9 @@ import org.apache.lucene.index.TermStates;
|
|||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.BoostQuery;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.FieldDoc;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.search.similarities.BM25Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
|
||||
|
@ -196,7 +198,7 @@ public final class FeatureField extends Field {
|
|||
|
||||
private static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;
|
||||
|
||||
private static float decodeFeatureValue(float freq) {
|
||||
static float decodeFeatureValue(float freq) {
|
||||
if (freq > MAX_FREQ) {
|
||||
// This is never used in practice but callers of the SimScorer API might
|
||||
// occasionally call it on eg. Float.MAX_VALUE to compute the max score
|
||||
|
@ -518,4 +520,22 @@ public final class FeatureField extends Field {
|
|||
float avgFreq = (float) ((double) states.totalTermFreq() / states.docFreq());
|
||||
return decodeFeatureValue(avgFreq);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a SortField for sorting by the value of a feature.
|
||||
* <p>
|
||||
* This sort orders documents by descending value of a feature. The value returned in {@link FieldDoc} for
|
||||
* the hits contains a Float instance with the feature value.
|
||||
* <p>
|
||||
* If a document is missing the field, then it is treated as having a vaue of <code>0.0f</code>.
|
||||
* <p>
|
||||
*
|
||||
* @param field field name. Must not be null.
|
||||
* @param featureName feature name. Must not be null.
|
||||
* @return SortField ordering documents by the value of the feature
|
||||
* @throws NullPointerException if {@code field} or {@code featureName} is null.
|
||||
*/
|
||||
public static SortField newFeatureSort(String field, String featureName) {
|
||||
return new FeatureSortField(field, featureName);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,164 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.document;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.FieldComparator;
|
||||
import org.apache.lucene.search.SimpleFieldComparator;
|
||||
import org.apache.lucene.search.SortField;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* Sorts using the value of a specified feature name from a {@link FeatureField}.
|
||||
*/
|
||||
final class FeatureSortField extends SortField {
|
||||
|
||||
private final String featureName;
|
||||
|
||||
/**
|
||||
* Creates a {@link FeatureSortField} that can be used to sort hits by
|
||||
* the value of a particular feature in a {@link FeatureField}.
|
||||
*
|
||||
* @param featureName The name of the feature to use for the sort value
|
||||
*/
|
||||
public FeatureSortField(String field, String featureName) {
|
||||
super(Objects.requireNonNull(field), SortField.Type.CUSTOM);
|
||||
this.featureName = Objects.requireNonNull(featureName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldComparator<?> getComparator(int numHits, int sortPos) {
|
||||
return new FeatureComparator(numHits, getField(), featureName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setMissingValue(Object missingValue) {
|
||||
throw new IllegalArgumentException("Missing value not supported for FeatureSortField");
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = super.hashCode();
|
||||
result = prime * result + featureName.hashCode();
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (!super.equals(obj)) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
FeatureSortField other = (FeatureSortField) obj;
|
||||
return Objects.equals(featureName, other.featureName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("<feature:");
|
||||
builder.append('"');
|
||||
builder.append(getField());
|
||||
builder.append('"');
|
||||
builder.append(" featureName=");
|
||||
builder.append(featureName);
|
||||
builder.append('>');
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
/** Parses a feature field's values as float and sorts by descending value */
|
||||
class FeatureComparator extends SimpleFieldComparator<Float> {
|
||||
private final String field;
|
||||
private final BytesRef featureName;
|
||||
private final float[] values;
|
||||
private float bottom;
|
||||
private float topValue;
|
||||
private PostingsEnum currentReaderPostingsValues;
|
||||
|
||||
/** Creates a new comparator based on relevance for {@code numHits}. */
|
||||
public FeatureComparator(int numHits, String field, String featureName) {
|
||||
this.values = new float[numHits];
|
||||
this.field = field;
|
||||
this.featureName = new BytesRef(featureName);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doSetNextReader(LeafReaderContext context) throws IOException {
|
||||
Terms terms = context.reader().terms(field);
|
||||
if (terms == null) {
|
||||
currentReaderPostingsValues = null;
|
||||
} else {
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
if (termsEnum.seekExact(featureName) == false) {
|
||||
currentReaderPostingsValues = null;
|
||||
} else {
|
||||
currentReaderPostingsValues = termsEnum.postings(currentReaderPostingsValues, PostingsEnum.FREQS);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private float getValueForDoc(int doc) throws IOException {
|
||||
if (currentReaderPostingsValues != null && doc >= currentReaderPostingsValues.docID()
|
||||
&& (currentReaderPostingsValues.docID() == doc || currentReaderPostingsValues.advance(doc) == doc)) {
|
||||
return FeatureField.decodeFeatureValue(currentReaderPostingsValues.freq());
|
||||
} else {
|
||||
return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(int slot1, int slot2) {
|
||||
return Float.compare(values[slot2], values[slot1]);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareBottom(int doc) throws IOException {
|
||||
return Float.compare(getValueForDoc(doc), bottom);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copy(int slot, int doc) throws IOException {
|
||||
values[slot] = getValueForDoc(doc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setBottom(final int bottom) {
|
||||
this.bottom = values[bottom];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setTopValue(Float value) {
|
||||
topValue = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Float value(int slot) {
|
||||
return Float.valueOf(values[slot]);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTop(int doc) throws IOException {
|
||||
return Float.compare(getValueForDoc(doc), topValue);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,213 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.document;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
/*
|
||||
* Test for sorting using a feature from a FeatureField.
|
||||
*
|
||||
* THE RULES:
|
||||
* 1. keywords like 'abstract' and 'static' should not appear in this file.
|
||||
* 2. each test method should be self-contained and understandable.
|
||||
* 3. no test methods should share code with other test methods.
|
||||
* 4. no testing of things unrelated to sorting.
|
||||
* 5. no tracers.
|
||||
* 6. keyword 'class' should appear only once in this file, here ----
|
||||
* |
|
||||
* -----------------------------------------------------------
|
||||
* |
|
||||
* \./
|
||||
*/
|
||||
public class TestFeatureSort extends LuceneTestCase {
|
||||
|
||||
public void testFeature() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 30.1F));
|
||||
doc.add(newStringField("value", "30.1", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 1.3F));
|
||||
doc.add(newStringField("value", "1.3", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 4.2F));
|
||||
doc.add(newStringField("value", "4.2", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
IndexReader ir = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
Sort sort = new Sort(FeatureField.newFeatureSort("field", "name"));
|
||||
|
||||
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
|
||||
assertEquals(3, td.totalHits.value);
|
||||
// numeric order
|
||||
assertEquals("30.1", searcher.doc(td.scoreDocs[0].doc).get("value"));
|
||||
assertEquals("4.2", searcher.doc(td.scoreDocs[1].doc).get("value"));
|
||||
assertEquals("1.3", searcher.doc(td.scoreDocs[2].doc).get("value"));
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testFeatureMissing() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 1.3F));
|
||||
doc.add(newStringField("value", "1.3", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 4.2F));
|
||||
doc.add(newStringField("value", "4.2", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
IndexReader ir = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
Sort sort = new Sort(FeatureField.newFeatureSort("field", "name"));
|
||||
|
||||
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
|
||||
assertEquals(3, td.totalHits.value);
|
||||
// null is treated as 0
|
||||
assertEquals("4.2", searcher.doc(td.scoreDocs[0].doc).get("value"));
|
||||
assertEquals("1.3", searcher.doc(td.scoreDocs[1].doc).get("value"));
|
||||
assertNull(searcher.doc(td.scoreDocs[2].doc).get("value"));
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testFeatureMissingFieldInSegment() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
writer.addDocument(doc);
|
||||
writer.commit();
|
||||
doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 1.3F));
|
||||
doc.add(newStringField("value", "1.3", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 4.2F));
|
||||
doc.add(newStringField("value", "4.2", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
IndexReader ir = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
Sort sort = new Sort(FeatureField.newFeatureSort("field", "name"));
|
||||
|
||||
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
|
||||
assertEquals(3, td.totalHits.value);
|
||||
// null is treated as 0
|
||||
assertEquals("4.2", searcher.doc(td.scoreDocs[0].doc).get("value"));
|
||||
assertEquals("1.3", searcher.doc(td.scoreDocs[1].doc).get("value"));
|
||||
assertNull(searcher.doc(td.scoreDocs[2].doc).get("value"));
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testFeatureMissingFeatureNameInSegment() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
doc.add(new FeatureField("field", "different_name", 0.5F));
|
||||
writer.addDocument(doc);
|
||||
writer.commit();
|
||||
doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 1.3F));
|
||||
doc.add(newStringField("value", "1.3", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 4.2F));
|
||||
doc.add(newStringField("value", "4.2", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
IndexReader ir = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
Sort sort = new Sort(FeatureField.newFeatureSort("field", "name"));
|
||||
|
||||
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
|
||||
assertEquals(3, td.totalHits.value);
|
||||
// null is treated as 0
|
||||
assertEquals("4.2", searcher.doc(td.scoreDocs[0].doc).get("value"));
|
||||
assertEquals("1.3", searcher.doc(td.scoreDocs[1].doc).get("value"));
|
||||
assertNull(searcher.doc(td.scoreDocs[2].doc).get("value"));
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testFeatureMultipleMissing() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||
Document doc = new Document();
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 1.3F));
|
||||
doc.add(newStringField("value", "1.3", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
doc = new Document();
|
||||
doc.add(new FeatureField("field", "name", 4.2F));
|
||||
doc.add(newStringField("value", "4.2", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
IndexReader ir = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
IndexSearcher searcher = newSearcher(ir);
|
||||
Sort sort = new Sort(FeatureField.newFeatureSort("field", "name"));
|
||||
|
||||
TopDocs td = searcher.search(new MatchAllDocsQuery(), 10, sort);
|
||||
assertEquals(7, td.totalHits.value);
|
||||
// null is treated as 0
|
||||
assertEquals("4.2", searcher.doc(td.scoreDocs[0].doc).get("value"));
|
||||
assertEquals("1.3", searcher.doc(td.scoreDocs[1].doc).get("value"));
|
||||
assertNull(searcher.doc(td.scoreDocs[2].doc).get("value"));
|
||||
assertNull(searcher.doc(td.scoreDocs[3].doc).get("value"));
|
||||
assertNull(searcher.doc(td.scoreDocs[4].doc).get("value"));
|
||||
assertNull(searcher.doc(td.scoreDocs[5].doc).get("value"));
|
||||
assertNull(searcher.doc(td.scoreDocs[6].doc).get("value"));
|
||||
|
||||
ir.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue