LUCENE-6486: make payloads optional in DocumentDictionary

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1680641 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2015-05-20 17:29:26 +00:00
parent 1a93333d50
commit 3e646430e3
5 changed files with 171 additions and 124 deletions

View File

@ -178,6 +178,10 @@ Bug Fixes
* LUCENE-6483: Ensure core closed listeners are called on the same cache key as
the reader which has been used to register the listener. (Adrien Grand)
* LUCENE-6486: DocumentDictionary iterator no longer skips
documents with no payloads and now returns an empty BytesRef instead
(Marius Grama via Michael McCandless)
API Changes
* LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search.suggest;
*/
import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
@ -41,18 +42,15 @@ import org.apache.lucene.util.BytesRef;
* <b>NOTE:</b>
* <ul>
* <li>
* The term and (optionally) payload fields have to be
* stored
* The term field has to be stored; if it is missing, the document is skipped.
* </li>
* <li>
* The payload and contexts fields are optional and are not required to be stored.
* </li>
* <li>
* The weight field can be stored or can be a {@link NumericDocValues}.
* If the weight field is not defined, the value of the weight is <code>0</code>
* </li>
* <li>
* if any of the term or (optionally) payload fields supplied
* do not have a value for a document, then the document is
* skipped by the dictionary
* </li>
* </ul>
*/
public class DocumentDictionary implements Dictionary {
@ -90,7 +88,7 @@ public class DocumentDictionary implements Dictionary {
* Creates a new dictionary with the contents of the fields named <code>field</code>
* for the terms, <code>weightField</code> for the weights that will be used for the
* corresponding terms, <code>payloadField</code> for the corresponding payloads
* for the entry and <code>contextsFeild</code> for associated contexts.
* for the entry and <code>contextsField</code> for associated contexts.
*/
public DocumentDictionary(IndexReader reader, String field, String weightField, String payloadField, String contextsField) {
this.reader = reader;
@ -167,25 +165,26 @@ public class DocumentDictionary implements Dictionary {
StoredDocument doc = reader.document(currentDocId, relevantFields);
Set<BytesRef> tempContexts = new HashSet<>();
BytesRef tempPayload;
BytesRef tempPayload = null;
if (hasPayloads) {
StorableField payload = doc.getField(payloadField);
if (payload == null) {
continue;
} else if (payload.binaryValue() != null) {
tempPayload = payload.binaryValue();
} else if (payload.stringValue() != null) {
tempPayload = new BytesRef(payload.stringValue());
} else {
continue;
if (payload != null) {
if (payload.binaryValue() != null) {
tempPayload = payload.binaryValue();
} else if (payload.stringValue() != null) {
tempPayload = new BytesRef(payload.stringValue());
}
}
// in case that the iterator has payloads configured, use empty values
// instead of null for payload
if (tempPayload == null) {
tempPayload = new BytesRef();
}
} else {
tempPayload = null;
}
Set<BytesRef> tempContexts;
if (hasContexts) {
tempContexts = new HashSet<>();
final StorableField[] contextFields = doc.getFields(contextsField);
for (StorableField contextField : contextFields) {
if (contextField.binaryValue() != null) {
@ -196,6 +195,8 @@ public class DocumentDictionary implements Dictionary {
continue;
}
}
} else {
tempContexts = Collections.emptySet();
}
currentDocFields = doc.getFields(field);

View File

@ -31,7 +31,8 @@ import org.apache.lucene.queries.function.ValueSource;
/**
* <p>
* Dictionary with terms and optionally payload information
* Dictionary with terms and, optionally, payload and
* contexts information
* taken from stored fields in a Lucene index. Similar to
* {@link DocumentDictionary}, except it obtains the weight
* of the terms in a document based on a {@link ValueSource}.
@ -39,13 +40,10 @@ import org.apache.lucene.queries.function.ValueSource;
* <b>NOTE:</b>
* <ul>
* <li>
* The term and (optionally) payload fields have to be
* stored
* The term field has to be stored; if it is missing, the document is skipped.
* </li>
* <li>
* if the term or (optionally) payload fields supplied
* do not have a value for a document, then the document is
* rejected by the dictionary
* The payload and contexts fields are optional and are not required to be stored.
* </li>
* </ul>
* <p>

View File

@ -55,63 +55,6 @@ public class DocumentDictionaryTest extends LuceneTestCase {
static final String PAYLOAD_FIELD_NAME = "p1";
static final String CONTEXT_FIELD_NAME = "c1";
/**
 * Builds {@code ndocs} randomized documents for the dictionary tests.
 * <p>
 * A document is recorded as invalid when it has no term field, or when
 * {@code requiresPayload} is set and no payload field was added to it.
 * Invalid documents without a term field are keyed as {@code "invalid_" + i}.
 *
 * @param ndocs number of documents to generate
 * @param requiresPayload whether a missing payload makes a document invalid
 * @param requiresContexts whether the contexts section is always entered
 *        (contexts may still be omitted inside it)
 * @return Pair(list of invalid document terms, Map of document term -&gt; document)
 */
private Map.Entry<List<String>, Map<String, Document>> generateIndexDocuments(int ndocs, boolean requiresPayload, boolean requiresContexts) {
Map<String, Document> docs = new HashMap<>();
List<String> invalidDocTerms = new ArrayList<>();
for(int i = 0; i < ndocs ; i++) {
Document doc = new Document();
boolean invalidDoc = false;
Field field = null;
// usually have valid term field in document
if (usually()) {
field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
doc.add(field);
} else {
// no term field: the document is recorded as invalid
invalidDoc = true;
}
// even if payload is not required usually have it
if (requiresPayload || usually()) {
// usually have valid payload field in document
if (usually()) {
Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i));
doc.add(payload);
} else if (requiresPayload) {
// a required payload is missing: the document is recorded as invalid
invalidDoc = true;
}
}
if (requiresContexts || usually()) {
if (usually()) {
// NOTE(review): atLeast(2) is re-evaluated on every iteration, so the
// loop bound is a fresh random value each time -- confirm this is intended
for (int j = 0; j < atLeast(2); j++) {
doc.add(new StoredField(CONTEXT_FIELD_NAME, new BytesRef("context_" + i + "_"+ j)));
}
}
// we should allow entries without context
}
// usually have valid weight field in document
if (usually()) {
// rarely a stored double field, otherwise a numeric doc values field
Field weight = (rarely()) ?
new StoredField(WEIGHT_FIELD_NAME, 100d + i) :
new NumericDocValuesField(WEIGHT_FIELD_NAME, 100 + i);
doc.add(weight);
}
String term = null;
if (invalidDoc) {
// invalid documents are keyed by their term when present, otherwise by a synthetic key
term = (field!=null) ? field.stringValue() : "invalid_" + i;
invalidDocTerms.add(term);
} else {
term = field.stringValue();
}
docs.put(term, doc);
}
return new SimpleEntry<>(invalidDocTerms, docs);
}
@Test
public void testEmptyReader() throws IOException {
Directory dir = newDirectory();
@ -140,7 +83,7 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), true, false);
Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), false);
Map<String, Document> docs = res.getValue();
List<String> invalidDocTerms = res.getKey();
for(Document doc: docs.values()) {
@ -157,7 +100,9 @@ public class DocumentDictionaryTest extends LuceneTestCase {
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
Field weightField = doc.getField(WEIGHT_FIELD_NAME);
assertEquals(inputIterator.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0);
assertTrue(inputIterator.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue()));
Field payloadField = doc.getField(PAYLOAD_FIELD_NAME);
if (payloadField == null) assertTrue(inputIterator.payload().length == 0);
else assertEquals(inputIterator.payload(), payloadField.binaryValue());
}
for (String invalidTerm : invalidDocTerms) {
@ -167,6 +112,41 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IOUtils.close(ir, analyzer, dir);
}
/**
 * Verifies that a document without the payload field is still returned by a
 * {@link DocumentDictionary} that was configured with a payload field, and
 * that its payload is reported as an empty {@link BytesRef} instead of
 * {@code null}.
 */
@Test
public void testWithOptionalPayload() throws IOException {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
  iwc.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);

  // Index a single document that has a term and a weight,
  // but neither a payload nor any contexts.
  Document doc = new Document();
  doc.add(new TextField(FIELD_NAME, "some field", Field.Store.YES));
  doc.add(new NumericDocValuesField(WEIGHT_FIELD_NAME, 100));
  writer.addDocument(doc);
  writer.commit();
  writer.close();

  IndexReader ir = DirectoryReader.open(dir);

  // Even though the payload field is missing, the dictionary iterator should
  // not skip the document because the payload field is optional.
  Dictionary dictionaryOptionalPayload =
      new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME);
  InputIterator inputIterator = dictionaryOptionalPayload.getEntryIterator();

  // assertEquals takes (expected, actual); using it instead of
  // assertTrue(a.equals(b)) also reports a skipped document (null term)
  // as an assertion failure rather than a NullPointerException.
  BytesRef f = inputIterator.next();
  assertEquals(new BytesRef(doc.get(FIELD_NAME)), f);

  Field weightField = doc.getField(WEIGHT_FIELD_NAME);
  assertEquals(weightField.numericValue().longValue(), inputIterator.weight());

  // The source document really has no payload, yet the iterator reports an empty one.
  assertNull(doc.getField(PAYLOAD_FIELD_NAME));
  assertEquals(0, inputIterator.payload().length);

  // Only one document was indexed, so the iterator must now be exhausted.
  assertNull(inputIterator.next());

  IOUtils.close(ir, analyzer, dir);
}
@Test
public void testWithoutPayload() throws IOException {
@ -175,7 +155,7 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), false, false);
Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), false);
Map<String, Document> docs = res.getValue();
List<String> invalidDocTerms = res.getKey();
for(Document doc: docs.values()) {
@ -192,7 +172,7 @@ public class DocumentDictionaryTest extends LuceneTestCase {
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
Field weightField = doc.getField(WEIGHT_FIELD_NAME);
assertEquals(inputIterator.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0);
assertEquals(inputIterator.payload(), null);
assertNull(inputIterator.payload());
}
for (String invalidTerm : invalidDocTerms) {
@ -211,7 +191,7 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), true, true);
Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), true);
Map<String, Document> docs = res.getValue();
List<String> invalidDocTerms = res.getKey();
for(Document doc: docs.values()) {
@ -228,7 +208,9 @@ public class DocumentDictionaryTest extends LuceneTestCase {
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
Field weightField = doc.getField(WEIGHT_FIELD_NAME);
assertEquals(inputIterator.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0);
assertTrue(inputIterator.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue()));
Field payloadField = doc.getField(PAYLOAD_FIELD_NAME);
if (payloadField == null) assertTrue(inputIterator.payload().length == 0);
else assertEquals(inputIterator.payload(), payloadField.binaryValue());
Set<BytesRef> oriCtxs = new HashSet<>();
Set<BytesRef> contextSet = inputIterator.contexts();
for (StorableField ctxf : doc.getFields(CONTEXT_FIELD_NAME)) {
@ -252,7 +234,7 @@ public class DocumentDictionaryTest extends LuceneTestCase {
IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), false, false);
Map.Entry<List<String>, Map<String, Document>> res = generateIndexDocuments(atLeast(1000), false);
Map<String, Document> docs = res.getValue();
List<String> invalidDocTerms = res.getKey();
Random rand = random();
@ -291,7 +273,7 @@ public class DocumentDictionaryTest extends LuceneTestCase {
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
Field weightField = doc.getField(WEIGHT_FIELD_NAME);
assertEquals(inputIterator.weight(), (weightField != null) ? weightField.numericValue().longValue() : 0);
assertEquals(inputIterator.payload(), null);
assertNull(inputIterator.payload());
}
for (String invalidTerm : invalidDocTerms) {
@ -324,13 +306,65 @@ public class DocumentDictionaryTest extends LuceneTestCase {
assertTrue(f.equals(nextSuggestion.term));
long weight = nextSuggestion.weight;
assertEquals(inputIterator.weight(), (weight != -1) ? weight : 0);
assertTrue(inputIterator.payload().equals(nextSuggestion.payload));
assertEquals(inputIterator.payload(), nextSuggestion.payload);
assertTrue(inputIterator.contexts().equals(nextSuggestion.contexts));
}
assertFalse(suggestionsIter.hasNext());
IOUtils.close(ir, analyzer, dir);
}
/**
 * Builds {@code ndocs} randomized documents for the dictionary tests.
 * <p>
 * A document is invalid only when it has no term field; such documents are
 * keyed as {@code "invalid_" + i} and their key is added to the invalid list.
 * Payload, contexts and weight fields are each added at random and never
 * affect validity.
 *
 * @param ndocs number of documents to generate
 * @param requiresContexts whether the contexts section is always entered
 *        (contexts may still be omitted inside it)
 * @return Pair(list of invalid document terms, Map of document term -&gt; document)
 */
private Map.Entry<List<String>, Map<String, Document>> generateIndexDocuments(int ndocs, boolean requiresContexts) {
  Map<String, Document> docs = new HashMap<>();
  List<String> invalidDocTerms = new ArrayList<>();
  for (int i = 0; i < ndocs; i++) {
    Document doc = new Document();
    final String term;

    // usually have valid term field in document
    if (usually()) {
      Field field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
      doc.add(field);
      term = field.stringValue();
    } else {
      // a missing term field is the only thing that makes a document invalid
      term = "invalid_" + i;
      invalidDocTerms.add(term);
    }

    // even if payload is not required usually have it
    if (usually()) {
      doc.add(new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i)));
    }

    if (requiresContexts || usually()) {
      if (usually()) {
        // draw the number of contexts once instead of re-drawing the bound on every iteration
        final int numContexts = atLeast(2);
        for (int j = 0; j < numContexts; j++) {
          doc.add(new StoredField(CONTEXT_FIELD_NAME, new BytesRef("context_" + i + "_" + j)));
        }
      }
      // we should allow entries without context
    }

    // usually have valid weight field in document
    if (usually()) {
      // rarely a stored double field, otherwise a numeric doc values field
      Field weight = rarely()
          ? new StoredField(WEIGHT_FIELD_NAME, 100d + i)
          : new NumericDocValuesField(WEIGHT_FIELD_NAME, 100 + i);
      doc.add(weight);
    }

    docs.put(term, doc);
  }
  return new SimpleEntry<>(invalidDocTerms, docs);
}
private List<Suggestion> indexMultiValuedDocuments(int numDocs, RandomIndexWriter writer) throws IOException {
List<Suggestion> suggestionList = new ArrayList<>(numDocs);

View File

@ -57,31 +57,6 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
static final String WEIGHT_FIELD_NAME_3 = "w3";
static final String PAYLOAD_FIELD_NAME = "p1";
static final String CONTEXTS_FIELD_NAME = "c1";
/**
 * Builds {@code ndocs} documents, each with a stored term field, a stored
 * payload, three numeric doc values weight fields and several context
 * entries.
 *
 * @param ndocs number of documents to generate
 * @return map from the term field's string value to its document
 */
private Map<String, Document> generateIndexDocuments(int ndocs) {
Map<String, Document> docs = new HashMap<>();
for(int i = 0; i < ndocs ; i++) {
Field field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i));
Field weight1 = new NumericDocValuesField(WEIGHT_FIELD_NAME_1, 10 + i);
Field weight2 = new NumericDocValuesField(WEIGHT_FIELD_NAME_2, 20 + i);
Field weight3 = new NumericDocValuesField(WEIGHT_FIELD_NAME_3, 30 + i);
Field contexts = new StoredField(CONTEXTS_FIELD_NAME, new BytesRef("ctx_" + i + "_0"));
Document doc = new Document();
doc.add(field);
doc.add(payload);
doc.add(weight1);
doc.add(weight2);
doc.add(weight3);
doc.add(contexts);
// NOTE(review): the same Field instance is mutated and added again on each
// iteration; presumably every entry then shows the last value -- confirm.
// atLeast(3) is also re-evaluated on every iteration.
for(int j = 1; j < atLeast(3); j++) {
contexts.setBytesValue(new BytesRef("ctx_" + i + "_" + j));
doc.add(contexts);
}
docs.put(field.stringValue(), doc);
}
return docs;
}
@Test
public void testEmptyReader() throws IOException {
@ -130,7 +105,9 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
long w3 = doc.getField(WEIGHT_FIELD_NAME_3).numericValue().longValue();
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
assertEquals(inputIterator.weight(), (w1 + w2 + w3));
assertTrue(inputIterator.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue()));
Field payloadField = doc.getField(PAYLOAD_FIELD_NAME);
if (payloadField == null) assertTrue(inputIterator.payload().length == 0);
else assertEquals(inputIterator.payload(), payloadField.binaryValue());
}
assertTrue(docs.isEmpty());
IOUtils.close(ir, analyzer, dir);
@ -162,7 +139,9 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
long w3 = doc.getField(WEIGHT_FIELD_NAME_3).numericValue().longValue();
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
assertEquals(inputIterator.weight(), (w1 + w2 + w3));
assertTrue(inputIterator.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue()));
Field payloadField = doc.getField(PAYLOAD_FIELD_NAME);
if (payloadField == null) assertTrue(inputIterator.payload().length == 0);
else assertEquals(inputIterator.payload(), payloadField.binaryValue());
Set<BytesRef> originalCtxs = new HashSet<>();
for (Field ctxf: doc.getFields(CONTEXTS_FIELD_NAME)) {
originalCtxs.add(ctxf.binaryValue());
@ -199,7 +178,7 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
long w3 = doc.getField(WEIGHT_FIELD_NAME_3).numericValue().longValue();
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
assertEquals(inputIterator.weight(), (w1 + w2 + w3));
assertEquals(inputIterator.payload(), null);
assertNull(inputIterator.payload());
}
assertTrue(docs.isEmpty());
IOUtils.close(ir, analyzer, dir);
@ -252,7 +231,9 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
long w2 = doc.getField(WEIGHT_FIELD_NAME_2).numericValue().longValue();
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
assertEquals(inputIterator.weight(), w2+w1);
assertTrue(inputIterator.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue()));
Field payloadField = doc.getField(PAYLOAD_FIELD_NAME);
if (payloadField == null) assertTrue(inputIterator.payload().length == 0);
else assertEquals(inputIterator.payload(), payloadField.binaryValue());
}
assertTrue(docs.isEmpty());
IOUtils.close(ir, analyzer, dir);
@ -280,10 +261,39 @@ public class DocumentValueSourceDictionaryTest extends LuceneTestCase {
Document doc = docs.remove(f.utf8ToString());
assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
assertEquals(inputIterator.weight(), 10);
assertTrue(inputIterator.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue()));
Field payloadField = doc.getField(PAYLOAD_FIELD_NAME);
if (payloadField == null) assertTrue(inputIterator.payload().length == 0);
else assertEquals(inputIterator.payload(), payloadField.binaryValue());
}
assertTrue(docs.isEmpty());
IOUtils.close(ir, analyzer, dir);
}
/**
 * Builds {@code ndocs} documents, each with a stored term field, three
 * numeric doc values weight fields, several distinct context entries and,
 * usually, a stored payload.
 *
 * @param ndocs number of documents to generate
 * @return map from the term field's string value to its document
 */
private Map<String, Document> generateIndexDocuments(int ndocs) {
  Map<String, Document> docs = new HashMap<>();
  for (int i = 0; i < ndocs; i++) {
    Field field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
    Document doc = new Document();
    doc.add(field);

    // even if payload is not required usually have it
    if (usually()) {
      doc.add(new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i)));
    }

    doc.add(new NumericDocValuesField(WEIGHT_FIELD_NAME_1, 10 + i));
    doc.add(new NumericDocValuesField(WEIGHT_FIELD_NAME_2, 20 + i));
    doc.add(new NumericDocValuesField(WEIGHT_FIELD_NAME_3, 30 + i));

    // Use a fresh field per context value: re-adding one mutated Field
    // instance would leave every entry aliasing the same object, so the
    // document would not carry distinct context values.
    // The count is drawn once rather than on every loop iteration.
    final int numContexts = atLeast(3);
    for (int j = 0; j < numContexts; j++) {
      doc.add(new StoredField(CONTEXTS_FIELD_NAME, new BytesRef("ctx_" + i + "_" + j)));
    }

    docs.put(field.stringValue(), doc);
  }
  return docs;
}
}