LUCENE-7611: Suggester uses LongValuesSource in place of ValueSource

This commit is contained in:
Alan Woodward 2017-01-06 11:03:09 +00:00
parent 713b65d1dc
commit 1a95c5acd0
5 changed files with 352 additions and 20 deletions

View File

@ -82,6 +82,9 @@ API Changes
* LUCENE-7610: The facets module now uses the DoubleValuesSource API, and
methods that take ValueSource parameters are deprecated (Alan Woodward)
* LUCENE-7611: DocumentValueSourceDictionary now takes a LongValuesSource
as a parameter, and the ValueSource equivalent is deprecated (Alan Woodward)
New features
* LUCENE-5867: Added BooleanSimilarity. (Robert Muir, Adrien Grand)

View File

@ -74,6 +74,33 @@ public abstract class LongValuesSource {
return fromLongField(field);
}
/**
 * Builds a LongValuesSource that reports the same {@code value} for every document.
 *
 * <p>The returned source never needs scores, and the per-segment values it hands out
 * accept any document id.
 *
 * @param value the constant to report for every document
 * @return a source whose values are always {@code value}
 */
public static LongValuesSource constant(long value) {
  // Per-segment view: every document "has" a value, and that value is the captured constant.
  final class FixedLongValues extends LongValues {
    @Override
    public boolean advanceExact(int doc) throws IOException {
      return true;
    }

    @Override
    public long longValue() throws IOException {
      return value;
    }
  }

  return new LongValuesSource() {
    @Override
    public boolean needsScores() {
      return false;
    }

    @Override
    public LongValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
      // A fresh instance per call, mirroring one-values-object-per-request semantics.
      return new FixedLongValues();
    }
  };
}
private static class FieldValuesSource extends LongValuesSource {
final String field;

View File

@ -17,15 +17,15 @@
package org.apache.lucene.search.suggest;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.LongValues;
import org.apache.lucene.search.LongValuesSource;
/**
@ -34,7 +34,7 @@ import org.apache.lucene.queries.function.ValueSource;
* optionally contexts information
* taken from stored fields in a Lucene index. Similar to
* {@link DocumentDictionary}, except it obtains the weight
* of the terms in a document based on a {@link ValueSource}.
* of the terms in a document based on a {@link LongValuesSource}.
* </p>
* <b>NOTE:</b>
* <ul>
@ -46,44 +46,75 @@ import org.apache.lucene.queries.function.ValueSource;
* </li>
* </ul>
* <p>
* In practice the {@link ValueSource} will likely be obtained
* In practice the {@link LongValuesSource} will likely be obtained
* using the lucene expression module. The following example shows
* how to create a {@link ValueSource} from a simple addition of two
* how to create a {@link LongValuesSource} from a simple addition of two
* fields:
* <code>
* Expression expression = JavascriptCompiler.compile("f1 + f2");
* SimpleBindings bindings = new SimpleBindings();
* bindings.add(new SortField("f1", SortField.Type.LONG));
* bindings.add(new SortField("f2", SortField.Type.LONG));
* ValueSource valueSource = expression.getValueSource(bindings);
* LongValuesSource valueSource = expression.getDoubleValuesSource(bindings).toLongValuesSource();
* </code>
* </p>
*
*/
public class DocumentValueSourceDictionary extends DocumentDictionary {
private final ValueSource weightsValueSource;
private final LongValuesSource weightsValueSource;
/**
* Creates a new dictionary with the contents of the fields named <code>field</code>
* for the terms, <code>payload</code> for the corresponding payloads, <code>contexts</code>
* for the associated contexts and uses the <code>weightsValueSource</code> supplied
* to determine the score.
*
* @param reader the index reader handed to the {@link DocumentDictionary} superclass
* @param field name of the field supplying the terms
* @param weightsValueSource legacy {@link ValueSource} used to determine the score; it is
*        converted with {@code asLongValuesSource()} before being stored
* @param payload name of the field supplying the payloads
* @param contexts name of the field supplying the contexts
* @deprecated Use {@link #DocumentValueSourceDictionary(IndexReader, String, LongValuesSource, String, String)}
*/
@Deprecated
public DocumentValueSourceDictionary(IndexReader reader, String field,
ValueSource weightsValueSource, String payload, String contexts) {
// NOTE(review): the null third argument is presumably the superclass's weight-field slot,
// left empty because weights come from weightsValueSource - confirm against DocumentDictionary.
super(reader, field, null, payload, contexts);
// Convert once up front so the rest of the class only deals with LongValuesSource.
this.weightsValueSource = weightsValueSource.asLongValuesSource();
}
/**
* Creates a new dictionary with the contents of the fields named <code>field</code>
* for the terms, <code>payload</code> for the corresponding payloads, <code>contexts</code>
* for the associated contexts and uses the <code>weightsValueSource</code> supplied
* to determine the score.
*
* @param reader the index reader handed to the {@link DocumentDictionary} superclass
* @param field name of the field supplying the terms
* @param weightsValueSource source used to determine the score; stored as given
* @param payload name of the field supplying the payloads
* @param contexts name of the field supplying the contexts
*/
public DocumentValueSourceDictionary(IndexReader reader, String field,
LongValuesSource weightsValueSource, String payload, String contexts) {
// NOTE(review): the null third argument is presumably the superclass's weight-field slot,
// left empty because weights come from weightsValueSource - confirm against DocumentDictionary.
super(reader, field, null, payload, contexts);
this.weightsValueSource = weightsValueSource;
}
/**
* Creates a new dictionary with the contents of the fields named <code>field</code>
* for the terms, <code>payload</code> for the corresponding payloads
* and uses the <code>weightsValueSource</code> supplied to determine the
* score.
*
* @param reader the index reader handed to the {@link DocumentDictionary} superclass
* @param field name of the field supplying the terms
* @param weightsValueSource legacy {@link ValueSource} used to determine the score; it is
*        converted with {@code asLongValuesSource()} before being stored
* @param payload name of the field supplying the payloads
* @deprecated Use {@link #DocumentValueSourceDictionary(IndexReader, String, LongValuesSource, String)}
*/
@Deprecated
public DocumentValueSourceDictionary(IndexReader reader, String field,
ValueSource weightsValueSource, String payload) {
// NOTE(review): the null third argument is presumably the superclass's weight-field slot,
// left empty because weights come from weightsValueSource - confirm against DocumentDictionary.
super(reader, field, null, payload);
// Convert once up front so the rest of the class only deals with LongValuesSource.
this.weightsValueSource = weightsValueSource.asLongValuesSource();
}
/**
* Creates a new dictionary with the contents of the fields named <code>field</code>
* for the terms, <code>payload</code> for the corresponding payloads
* and uses the <code>weightsValueSource</code> supplied to determine the
* score.
*
* @param reader the index reader handed to the {@link DocumentDictionary} superclass
* @param field name of the field supplying the terms
* @param weightsValueSource source used to determine the score; stored as given
* @param payload name of the field supplying the payloads
*/
public DocumentValueSourceDictionary(IndexReader reader, String field,
LongValuesSource weightsValueSource, String payload) {
// NOTE(review): the null third argument is presumably the superclass's weight-field slot,
// left empty because weights come from weightsValueSource - confirm against DocumentDictionary.
super(reader, field, null, payload);
this.weightsValueSource = weightsValueSource;
}
@ -91,11 +122,25 @@ public class DocumentValueSourceDictionary extends DocumentDictionary {
* Creates a new dictionary with the contents of the fields named <code>field</code>
* for the terms and uses the <code>weightsValueSource</code> supplied to determine the
* score.
*
* @deprecated Use {@link #DocumentValueSourceDictionary(IndexReader, String, LongValuesSource)}
*/
@Deprecated
public DocumentValueSourceDictionary(IndexReader reader, String field,
ValueSource weightsValueSource) {
super(reader, field, null, null);
this.weightsValueSource = weightsValueSource;
this.weightsValueSource = weightsValueSource.asLongValuesSource();
}
/**
* Creates a new dictionary with the contents of the fields named <code>field</code>
* for the terms and uses the <code>weightsValueSource</code> supplied to determine the
* score.
*
* @param reader the index reader handed to the {@link DocumentDictionary} superclass
* @param field name of the field supplying the terms
* @param weightsValueSource source used to determine the score; stored as given
*/
public DocumentValueSourceDictionary(IndexReader reader, String field,
LongValuesSource weightsValueSource) {
// NOTE(review): both nulls are passed straight to the superclass; presumably they are the
// weight-field and payload-field slots (no payloads for this overload) - confirm against
// DocumentDictionary.
super(reader, field, null, null);
this.weightsValueSource = weightsValueSource;
}
@Override
@ -105,7 +150,7 @@ public class DocumentValueSourceDictionary extends DocumentDictionary {
final class DocumentValueSourceInputIterator extends DocumentDictionary.DocumentInputIterator {
private FunctionValues currentWeightValues;
private LongValues currentWeightValues;
/** leaves of the reader */
private final List<LeafReaderContext> leaves;
/** starting docIds of all the leaves */
@ -123,7 +168,7 @@ public class DocumentValueSourceDictionary extends DocumentDictionary {
}
starts[leaves.size()] = reader.maxDoc();
currentWeightValues = (leaves.size() > 0)
? weightsValueSource.getValues(new HashMap<String, Object>(), leaves.get(currentLeafIndex))
? weightsValueSource.getValues(leaves.get(currentLeafIndex), null)
: null;
}
@ -140,13 +185,16 @@ public class DocumentValueSourceDictionary extends DocumentDictionary {
if (subIndex != currentLeafIndex) {
currentLeafIndex = subIndex;
try {
currentWeightValues = weightsValueSource.getValues(new HashMap<String, Object>(), leaves.get(currentLeafIndex));
currentWeightValues = weightsValueSource.getValues(leaves.get(currentLeafIndex), null);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
try {
return currentWeightValues.longVal(docId - starts[subIndex]);
if (currentWeightValues.advanceExact(docId - starts[subIndex]))
return currentWeightValues.longValue();
else
return 0;
} catch (IOException e) {
throw new RuntimeException(e);
}

View File

@ -36,12 +36,16 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.DoubleConstValueSource;
import org.apache.lucene.queries.function.valuesource.LongFieldSource;
import org.apache.lucene.queries.function.valuesource.SumFloatFunction;
import org.apache.lucene.search.DoubleValues;
import org.apache.lucene.search.LongValues;
import org.apache.lucene.search.LongValuesSource;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
@ -59,7 +63,7 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
static final String CONTEXTS_FIELD_NAME = "c1";
@Test
public void testEmptyReader() throws IOException {
public void testValueSourceEmptyReader() throws IOException {
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
@ -78,9 +82,30 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
IOUtils.close(ir, analyzer, dir);
}
/** An index without documents must yield no entries, a zero weight and no payload. */
@Test
public void testLongValuesSourceEmptyReader() throws IOException {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  // Commit without adding documents so that an (empty) index exists for the reader to open.
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
  writer.commit();
  writer.close();
  IndexReader ir = DirectoryReader.open(dir);
  Dictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, LongValuesSource.constant(10), PAYLOAD_FIELD_NAME);
  InputIterator inputIterator = dictionary.getEntryIterator();

  assertNull(inputIterator.next());
  // Expected value first, actual second, so a failure message reads the right way round.
  assertEquals(0, inputIterator.weight());
  assertNull(inputIterator.payload());

  IOUtils.close(ir, analyzer, dir);
}
@Test
public void testBasic() throws IOException {
public void testValueSourceBasic() throws IOException {
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
@ -112,9 +137,83 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
assertTrue(docs.isEmpty());
IOUtils.close(ir, analyzer, dir);
}
/**
 * Test helper: builds a LongValuesSource whose value for a document is the sum of the
 * given long fields. A document only reports a value when every field has one, but all
 * fields are always advanced to the requested document.
 */
private static LongValuesSource sum(String... fields) {
  final LongValuesSource[] perField = new LongValuesSource[fields.length];
  int slot = 0;
  for (String name : fields) {
    perField[slot++] = LongValuesSource.fromLongField(name);
  }

  return new LongValuesSource() {
    @Override
    public boolean needsScores() {
      return false;
    }

    @Override
    public LongValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
      final LongValues[] leafValues = new LongValues[fields.length];
      for (int idx = 0; idx < perField.length; idx++) {
        leafValues[idx] = perField[idx].getValues(ctx, scores);
      }

      return new LongValues() {
        @Override
        public boolean advanceExact(int doc) throws IOException {
          boolean allPresent = true;
          // Deliberately no early exit: every underlying field must be positioned on doc.
          for (int idx = 0; idx < leafValues.length; idx++) {
            boolean present = leafValues[idx].advanceExact(doc);
            allPresent = allPresent & present;
          }
          return allPresent;
        }

        @Override
        public long longValue() throws IOException {
          long total = 0;
          for (int idx = 0; idx < leafValues.length; idx++) {
            total += leafValues[idx].longValue();
          }
          return total;
        }
      };
    }
  };
}
/** Every indexed document must be suggested once, weighted by the sum of its three weight fields. */
@Test
public void testLongValuesSourceBasic() throws IOException {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
  Map<String, Document> docs = generateIndexDocuments(atLeast(100));
  for (Document doc : docs.values()) {
    writer.addDocument(doc);
  }
  writer.commit();
  writer.close();

  IndexReader ir = DirectoryReader.open(dir);
  LongValuesSource sumValueSource = sum(WEIGHT_FIELD_NAME_1, WEIGHT_FIELD_NAME_2, WEIGHT_FIELD_NAME_3);
  Dictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, sumValueSource, PAYLOAD_FIELD_NAME);
  InputIterator inputIterator = dictionary.getEntryIterator();
  BytesRef f;
  while ((f = inputIterator.next()) != null) {
    Document doc = docs.remove(f.utf8ToString());
    // Fail with a readable message instead of an NPE if an unknown or duplicate term shows up.
    assertNotNull("unexpected suggestion: " + f.utf8ToString(), doc);
    long w1 = doc.getField(WEIGHT_FIELD_NAME_1).numericValue().longValue();
    long w2 = doc.getField(WEIGHT_FIELD_NAME_2).numericValue().longValue();
    long w3 = doc.getField(WEIGHT_FIELD_NAME_3).numericValue().longValue();
    // Expected value first, actual second, so failure messages read the right way round.
    assertEquals(new BytesRef(doc.get(FIELD_NAME)), f);
    assertEquals(w1 + w2 + w3, inputIterator.weight());
    IndexableField payloadField = doc.getField(PAYLOAD_FIELD_NAME);
    if (payloadField == null) {
      assertEquals(0, inputIterator.payload().length);
    } else {
      assertEquals(payloadField.binaryValue(), inputIterator.payload());
    }
  }
  // Every generated document must have been consumed by the iterator.
  assertTrue(docs.isEmpty());
  IOUtils.close(ir, analyzer, dir);
}
@Test
public void testWithContext() throws IOException {
public void testValueSourceWithContext() throws IOException {
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
@ -153,7 +252,46 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
}
@Test
public void testWithoutPayload() throws IOException {
public void testLongValuesSourceWithContext() throws IOException {
  // Verifies terms, summed weights, payloads and contexts for a dictionary built with a contexts field.
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
  Map<String, Document> docs = generateIndexDocuments(atLeast(100));
  for (Document doc : docs.values()) {
    writer.addDocument(doc);
  }
  writer.commit();
  writer.close();

  IndexReader ir = DirectoryReader.open(dir);
  LongValuesSource sumValues = sum(WEIGHT_FIELD_NAME_1, WEIGHT_FIELD_NAME_2, WEIGHT_FIELD_NAME_3);
  Dictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, sumValues, PAYLOAD_FIELD_NAME, CONTEXTS_FIELD_NAME);
  InputIterator inputIterator = dictionary.getEntryIterator();
  BytesRef f;
  while ((f = inputIterator.next()) != null) {
    Document doc = docs.remove(f.utf8ToString());
    // Fail with a readable message instead of an NPE if an unknown or duplicate term shows up.
    assertNotNull("unexpected suggestion: " + f.utf8ToString(), doc);
    long w1 = doc.getField(WEIGHT_FIELD_NAME_1).numericValue().longValue();
    long w2 = doc.getField(WEIGHT_FIELD_NAME_2).numericValue().longValue();
    long w3 = doc.getField(WEIGHT_FIELD_NAME_3).numericValue().longValue();
    // Expected value first, actual second, so failure messages read the right way round.
    assertEquals(new BytesRef(doc.get(FIELD_NAME)), f);
    assertEquals(w1 + w2 + w3, inputIterator.weight());
    IndexableField payloadField = doc.getField(PAYLOAD_FIELD_NAME);
    if (payloadField == null) {
      assertEquals(0, inputIterator.payload().length);
    } else {
      assertEquals(payloadField.binaryValue(), inputIterator.payload());
    }
    Set<BytesRef> originalCtxs = new HashSet<>();
    for (IndexableField ctxf : doc.getFields(CONTEXTS_FIELD_NAME)) {
      originalCtxs.add(ctxf.binaryValue());
    }
    assertEquals(originalCtxs, inputIterator.contexts());
  }
  // Every generated document must have been consumed by the iterator.
  assertTrue(docs.isEmpty());
  IOUtils.close(ir, analyzer, dir);
}
@Test
public void testValueSourceWithoutPayload() throws IOException {
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
@ -183,9 +321,41 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
assertTrue(docs.isEmpty());
IOUtils.close(ir, analyzer, dir);
}
/** Without a payload field the iterator must report summed weights and a null payload. */
@Test
public void testLongValuesSourceWithoutPayload() throws IOException {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
  Map<String, Document> docs = generateIndexDocuments(atLeast(100));
  for (Document doc : docs.values()) {
    writer.addDocument(doc);
  }
  writer.commit();
  writer.close();

  IndexReader ir = DirectoryReader.open(dir);
  LongValuesSource sumValues = sum(WEIGHT_FIELD_NAME_1, WEIGHT_FIELD_NAME_2, WEIGHT_FIELD_NAME_3);
  Dictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, sumValues);
  InputIterator inputIterator = dictionary.getEntryIterator();
  BytesRef f;
  while ((f = inputIterator.next()) != null) {
    Document doc = docs.remove(f.utf8ToString());
    // Fail with a readable message instead of an NPE if an unknown or duplicate term shows up.
    assertNotNull("unexpected suggestion: " + f.utf8ToString(), doc);
    long w1 = doc.getField(WEIGHT_FIELD_NAME_1).numericValue().longValue();
    long w2 = doc.getField(WEIGHT_FIELD_NAME_2).numericValue().longValue();
    long w3 = doc.getField(WEIGHT_FIELD_NAME_3).numericValue().longValue();
    // Expected value first, actual second, so failure messages read the right way round.
    assertEquals(new BytesRef(doc.get(FIELD_NAME)), f);
    assertEquals(w1 + w2 + w3, inputIterator.weight());
    assertNull(inputIterator.payload());
  }
  // Every generated document must have been consumed by the iterator.
  assertTrue(docs.isEmpty());
  IOUtils.close(ir, analyzer, dir);
}
@Test
public void testWithDeletions() throws IOException {
public void testValueSourceWithDeletions() throws IOException {
Directory dir = newDirectory();
Analyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
@ -238,6 +408,60 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
assertTrue(docs.isEmpty());
IOUtils.close(ir, analyzer, dir);
}
/** Deleted documents must not be suggested; survivors carry the sum of the first two weight fields. */
@Test
public void testLongValuesSourceWithDeletions() throws IOException {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
  Map<String, Document> docs = generateIndexDocuments(atLeast(100));
  Random rand = random();
  List<String> termsToDel = new ArrayList<>();
  for (Document doc : docs.values()) {
    // The size guard keeps at least one document undeleted so the reader is never empty.
    if (rand.nextBoolean() && termsToDel.size() < docs.size() - 1) {
      termsToDel.add(doc.get(FIELD_NAME));
    }
    writer.addDocument(doc);
  }
  writer.commit();

  Term[] delTerms = new Term[termsToDel.size()];
  for (int i = 0; i < termsToDel.size(); i++) {
    delTerms[i] = new Term(FIELD_NAME, termsToDel.get(i));
  }

  for (Term delTerm : delTerms) {
    writer.deleteDocuments(delTerm);
  }
  writer.commit();
  writer.close();

  // Mirror the deletions in the expected map; each deleted term must have been present.
  for (String termToDel : termsToDel) {
    assertNotNull(docs.remove(termToDel));
  }

  IndexReader ir = DirectoryReader.open(dir);
  assertTrue("NumDocs should be > 0 but was " + ir.numDocs(), ir.numDocs() > 0);
  // Expected value first, actual second, so failure messages read the right way round.
  assertEquals(docs.size(), ir.numDocs());
  LongValuesSource sumValues = sum(WEIGHT_FIELD_NAME_1, WEIGHT_FIELD_NAME_2);
  Dictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, sumValues, PAYLOAD_FIELD_NAME);
  InputIterator inputIterator = dictionary.getEntryIterator();
  BytesRef f;
  while ((f = inputIterator.next()) != null) {
    Document doc = docs.remove(f.utf8ToString());
    // Fail with a readable message instead of an NPE if a deleted or unknown term shows up.
    assertNotNull("unexpected suggestion: " + f.utf8ToString(), doc);
    long w1 = doc.getField(WEIGHT_FIELD_NAME_1).numericValue().longValue();
    long w2 = doc.getField(WEIGHT_FIELD_NAME_2).numericValue().longValue();
    assertEquals(new BytesRef(doc.get(FIELD_NAME)), f);
    assertEquals(w2 + w1, inputIterator.weight());
    IndexableField payloadField = doc.getField(PAYLOAD_FIELD_NAME);
    if (payloadField == null) {
      assertEquals(0, inputIterator.payload().length);
    } else {
      assertEquals(payloadField.binaryValue(), inputIterator.payload());
    }
  }
  // Every surviving document must have been consumed by the iterator.
  assertTrue(docs.isEmpty());
  IOUtils.close(ir, analyzer, dir);
}
@Test
public void testWithValueSource() throws IOException {
@ -269,6 +493,36 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
IOUtils.close(ir, analyzer, dir);
}
/** With a constant weight source every suggestion must carry exactly that constant weight. */
@Test
public void testWithLongValuesSource() throws IOException {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
  Map<String, Document> docs = generateIndexDocuments(atLeast(100));
  for (Document doc : docs.values()) {
    writer.addDocument(doc);
  }
  writer.commit();
  writer.close();

  IndexReader ir = DirectoryReader.open(dir);
  Dictionary dictionary = new DocumentValueSourceDictionary(ir, FIELD_NAME, LongValuesSource.constant(10), PAYLOAD_FIELD_NAME);
  InputIterator inputIterator = dictionary.getEntryIterator();
  BytesRef f;
  while ((f = inputIterator.next()) != null) {
    Document doc = docs.remove(f.utf8ToString());
    // Fail with a readable message instead of an NPE if an unknown or duplicate term shows up.
    assertNotNull("unexpected suggestion: " + f.utf8ToString(), doc);
    // Expected value first, actual second, so failure messages read the right way round.
    assertEquals(new BytesRef(doc.get(FIELD_NAME)), f);
    assertEquals(10, inputIterator.weight());
    IndexableField payloadField = doc.getField(PAYLOAD_FIELD_NAME);
    if (payloadField == null) {
      assertEquals(0, inputIterator.payload().length);
    } else {
      assertEquals(payloadField.binaryValue(), inputIterator.payload());
    }
  }
  // Every generated document must have been consumed by the iterator.
  assertTrue(docs.isEmpty());
  IOUtils.close(ir, analyzer, dir);
}
private Map<String, Document> generateIndexDocuments(int ndocs) {
Map<String, Document> docs = new HashMap<>();
for(int i = 0; i < ndocs ; i++) {

View File

@ -23,7 +23,7 @@ import java.util.Set;
import org.apache.lucene.expressions.Expression;
import org.apache.lucene.expressions.SimpleBindings;
import org.apache.lucene.expressions.js.JavascriptCompiler;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.LongValuesSource;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.suggest.DocumentValueSourceDictionary;
@ -93,7 +93,7 @@ public class DocumentExpressionDictionaryFactory extends DictionaryFactory {
sortFields), payloadField);
}
public ValueSource fromExpression(String weightExpression, Set<SortField> sortFields) {
public LongValuesSource fromExpression(String weightExpression, Set<SortField> sortFields) {
Expression expression = null;
try {
expression = JavascriptCompiler.compile(weightExpression);
@ -104,7 +104,7 @@ public class DocumentExpressionDictionaryFactory extends DictionaryFactory {
for (SortField sortField : sortFields) {
bindings.add(sortField);
}
return expression.getValueSource(bindings);
return expression.getDoubleValuesSource(bindings).toLongValuesSource();
}
private SortField.Type getSortFieldType(SolrCore core, String sortFieldName) {