LUCENE-8601: attributes added to IndexableFieldType during indexing will now be preserved in the index and accessible at search time via FieldInfo attributes

This commit is contained in:
Mike McCandless 2019-01-03 18:44:41 -05:00
parent ec43d100d1
commit 63dfba4c7d
11 changed files with 163 additions and 38 deletions

View File

@ -260,6 +260,10 @@ New Features
IndexWriterConfig#setIndexCreatedVersionMajor. This is an expert feature.
(Adrien Grand)
* LUCENE-8601: Attributes set in the IndexableFieldType for each field during indexing will
now be recorded into the corresponding FieldInfo's attributes, accessible at search
time (Murali Krishna P)
Improvements
* LUCENE-8463: TopFieldCollector can now early-terminate queries when sorting by SortField.DOC.

View File

@ -135,7 +135,8 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
// Group each consumer by the fields it handles
for (FieldInfo fi : mergeState.mergeFieldInfos) {
DocValuesConsumer consumer = getInstance(fi);
// merge should ignore current format for the fields being merged
DocValuesConsumer consumer = getInstance(fi, true);
Collection<String> fieldsForConsumer = consumersToField.get(consumer);
if (fieldsForConsumer == null) {
fieldsForConsumer = new ArrayList<>();
@ -156,9 +157,23 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
}
/**
 * DocValuesConsumer for the given field for the normal (non-merge) write path:
 * any format/suffix already recorded in the field's attributes is honored.
 *
 * @param field the field to obtain a consumer for
 * @return DocValuesConsumer for the field
 * @throws IOException if there is a low-level IO error
 */
private DocValuesConsumer getInstance(FieldInfo field) throws IOException {
return getInstance(field, false);
}
/**
* DocValuesConsumer for the given field.
* @param field - FieldInfo object.
* @param ignoreCurrentFormat - ignore the existing format attributes.
* @return DocValuesConsumer for the field.
* @throws IOException if there is a low-level IO error
*/
private DocValuesConsumer getInstance(FieldInfo field, boolean ignoreCurrentFormat) throws IOException {
DocValuesFormat format = null;
if (field.getDocValuesGen() != -1) {
final String formatName = field.getAttribute(PER_FIELD_FORMAT_KEY);
String formatName = null;
if (ignoreCurrentFormat == false) {
formatName = field.getAttribute(PER_FIELD_FORMAT_KEY);
}
// this means the field never existed in that segment, yet is applied updates
if (formatName != null) {
format = DocValuesFormat.forName(formatName);
@ -171,21 +186,19 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
throw new IllegalStateException("invalid null DocValuesFormat for field=\"" + field.name + "\"");
}
final String formatName = format.getName();
String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
if (field.getDocValuesGen() == -1 && previousValue != null) {
throw new IllegalStateException("found existing value for " + PER_FIELD_FORMAT_KEY +
", field=" + field.name + ", old=" + previousValue + ", new=" + formatName);
}
field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
Integer suffix = null;
ConsumerAndSuffix consumer = formats.get(format);
if (consumer == null) {
// First time we are seeing this format; create a new instance
if (field.getDocValuesGen() != -1) {
final String suffixAtt = field.getAttribute(PER_FIELD_SUFFIX_KEY);
String suffixAtt = null;
if (!ignoreCurrentFormat) {
suffixAtt = field.getAttribute(PER_FIELD_SUFFIX_KEY);
}
// even when dvGen is != -1, it can still be a new field, that never
// existed in the segment, and therefore doesn't have the recorded
// attributes yet.
@ -193,7 +206,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
suffix = Integer.valueOf(suffixAtt);
}
}
if (suffix == null) {
// bump the suffix
suffix = suffixes.get(formatName);
@ -204,7 +217,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
}
}
suffixes.put(formatName, suffix);
final String segmentSuffix = getFullSegmentSuffix(segmentWriteState.segmentSuffix,
getSuffix(formatName, Integer.toString(suffix)));
consumer = new ConsumerAndSuffix();
@ -216,13 +229,8 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
assert suffixes.containsKey(formatName);
suffix = consumer.suffix;
}
previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
if (field.getDocValuesGen() == -1 && previousValue != null) {
throw new IllegalStateException("found existing value for " + PER_FIELD_SUFFIX_KEY +
", field=" + field.name + ", old=" + previousValue + ", new=" + suffix);
}
field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
// TODO: we should only provide the "slice" of FIS
// that this DVF actually sees ...
return consumer.consumer;

View File

@ -188,14 +188,14 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
// Assign field -> PostingsFormat
for(String field : indexedFieldNames) {
FieldInfo fieldInfo = writeState.fieldInfos.fieldInfo(field);
// TODO: This should check current format from the field attribute?
final PostingsFormat format = getPostingsFormatForField(field);
if (format == null) {
throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field + "\"");
}
String formatName = format.getName();
FieldsGroup group = formatToGroups.get(format);
if (group == null) {
// First time we are seeing this format; create a
@ -226,17 +226,8 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
group.fields.add(field);
String previousValue = fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
if (previousValue != null) {
throw new IllegalStateException("found existing value for " + PER_FIELD_FORMAT_KEY +
", field=" + fieldInfo.name + ", old=" + previousValue + ", new=" + formatName);
}
previousValue = fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix));
if (previousValue != null) {
throw new IllegalStateException("found existing value for " + PER_FIELD_SUFFIX_KEY +
", field=" + fieldInfo.name + ", old=" + previousValue + ", new=" + group.suffix);
}
fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix));
}
return formatToGroups;
}

View File

@ -17,6 +17,9 @@
package org.apache.lucene.document;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer; // javadocs
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
@ -41,6 +44,7 @@ public class FieldType implements IndexableFieldType {
private int dataDimensionCount;
private int indexDimensionCount;
private int dimensionNumBytes;
private Map<String, String> attributes;
/**
* Create a new mutable FieldType with all of the properties from <code>ref</code>
@ -58,6 +62,9 @@ public class FieldType implements IndexableFieldType {
this.dataDimensionCount = ref.pointDataDimensionCount();
this.indexDimensionCount = ref.pointIndexDimensionCount();
this.dimensionNumBytes = ref.pointNumBytes();
if (ref.getAttributes() != null) {
this.attributes = new HashMap<>(ref.getAttributes());
}
// Do not copy frozen!
}
@ -341,6 +348,30 @@ public class FieldType implements IndexableFieldType {
return dimensionNumBytes;
}
/**
 * Records a codec-visible attribute on this field type.
 * <p>
 * Attributes are free-form key/value metadata that the codec can use to store
 * additional information for the field.
 * <p>
 * Re-using an existing key replaces its previous value. This method is not
 * thread-safe: users must not add attributes while other threads are indexing
 * documents with this field type.
 *
 * @return the value previously associated with {@code key}, or {@code null}
 * @lucene.experimental
 */
public String putAttribute(String key, String value) {
  // Lazily create the backing map on first use.
  Map<String, String> map = attributes;
  if (map == null) {
    map = new HashMap<>();
    attributes = map;
  }
  return map.put(key, value);
}
/**
 * Returns the attribute map for this field type, or {@code null} if
 * {@code putAttribute} was never called (the map is created lazily).
 * Note: this is the live internal map, not a defensive copy.
 */
@Override
public Map<String, String> getAttributes() {
return attributes;
}
/** Prints a Field for human consumption. */
@Override
public String toString() {

View File

@ -661,6 +661,11 @@ final class DefaultIndexingChain extends DocConsumer {
FieldInfo fi = fieldInfos.getOrAdd(name);
initIndexOptions(fi, fieldType.indexOptions());
Map<String, String> attributes = fieldType.getAttributes();
if (attributes != null) {
attributes.forEach((k, v) -> fi.putAttribute(k, v));
}
fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert);
fp.next = fieldHash[hashPos];
fieldHash[hashPos] = fp;

View File

@ -142,7 +142,7 @@ public final class FieldInfo {
// should only be called by FieldInfos#addOrUpdate
void update(boolean storeTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions,
int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes) {
Map<String, String> attributes, int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes) {
if (indexOptions == null) {
throw new NullPointerException("IndexOptions must not be null (field: \"" + name + "\")");
}
@ -176,6 +176,9 @@ public final class FieldInfo {
// cannot store payloads if we don't store positions:
this.storePayloads = false;
}
if (attributes != null) {
this.attributes.putAll(attributes);
}
assert checkConsistency();
}
@ -346,8 +349,9 @@ public final class FieldInfo {
* to store additional metadata, and will be available to the codec
* when reading the segment via {@link #getAttribute(String)}
* <p>
* If a value already exists for the field, it will be replaced with
* the new value.
If a value already exists for the key in the field, it will be replaced with
the new value. If the value of an attribute for the same field changes between
documents, the behaviour after merge is undefined.
*/
public String putAttribute(String key, String value) {
return attributes.put(key, value);

View File

@ -506,12 +506,18 @@ public class FieldInfos implements Iterable<FieldInfo> {
boolean storeTermVector,
boolean omitNorms, boolean storePayloads, IndexOptions indexOptions,
DocValuesType docValues, long dvGen,
Map<String, String> attributes,
int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes,
boolean isSoftDeletesField) {
assert assertNotFinished();
if (docValues == null) {
throw new NullPointerException("DocValuesType must not be null");
}
if (attributes != null) {
// original attributes is UnmodifiableMap
attributes = new HashMap<>(attributes);
}
FieldInfo fi = fieldInfo(name);
if (fi == null) {
// This field wasn't yet added to this in-RAM
@ -520,12 +526,12 @@ public class FieldInfos implements Iterable<FieldInfo> {
// before then we'll get the same name and number,
// else we'll allocate a new one:
final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
assert !byName.containsKey(fi.name);
globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
byName.put(fi.name, fi);
} else {
fi.update(storeTermVector, omitNorms, storePayloads, indexOptions, dataDimensionCount, indexDimensionCount, dimensionNumBytes);
fi.update(storeTermVector, omitNorms, storePayloads, indexOptions, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes);
if (docValues != DocValuesType.NONE) {
// Only pay the synchronization cost if fi does not already have a DVType
@ -553,6 +559,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
return addOrUpdateInternal(fi.name, fi.number, fi.hasVectors(),
fi.omitsNorms(), fi.hasPayloads(),
fi.getIndexOptions(), fi.getDocValuesType(), dvGen,
fi.attributes(),
fi.getPointDataDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
fi.isSoftDeletesField());
}

View File

@ -17,6 +17,8 @@
package org.apache.lucene.index;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer; // javadocs
/**
@ -111,4 +113,14 @@ public interface IndexableFieldType {
* The number of bytes in each dimension's values.
*/
public int pointNumBytes();
/**
 * Attributes for the field type.
 * <p>
 * Attributes are key/value metadata recorded into the field's
 * {@code FieldInfo} at index time and available to the codec.
 * <p>
 * Attributes are not thread-safe: users must not add attributes while other
 * threads are indexing documents with this field type.
 *
 * @return the attribute key/value map, or {@code null} if the field type has no attributes
 */
public Map<String, String> getAttributes();
}

View File

@ -23,6 +23,7 @@ import java.util.Iterator;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@ -93,6 +94,57 @@ public class TestFieldInfos extends LuceneTestCase {
dir.close();
}
/**
 * Verifies that attributes set on a FieldType during indexing are recorded into
 * the per-field FieldInfo, survive commits and a forced merge, and that copying
 * a FieldType deep-copies its attributes.
 */
public void testFieldAttributes() throws Exception {
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
      .setMergePolicy(NoMergePolicy.INSTANCE));

  FieldType type1 = new FieldType();
  type1.setStored(true);
  type1.putAttribute("testKey1", "testValue1");
  Document d1 = new Document();
  d1.add(new Field("f1", "v1", type1));

  // Changing the value after copying must not impact the original type1:
  // the copy constructor deep-copies the attribute map.
  FieldType type2 = new FieldType(type1);
  type2.putAttribute("testKey1", "testValue2");

  writer.addDocument(d1);
  writer.commit();

  Document d2 = new Document();
  type1.putAttribute("testKey1", "testValueX");
  type1.putAttribute("testKey2", "testValue2");
  d2.add(new Field("f1", "v2", type1));
  d2.add(new Field("f2", "v2", type2));
  writer.addDocument(d2);
  writer.commit();
  writer.forceMerge(1);

  IndexReader reader = writer.getReader();
  FieldInfos fis = FieldInfos.getMergedFieldInfos(reader);
  assertEquals(2, fis.size());
  for (FieldInfo fi : fis) {
    switch (fi.name) {
      case "f1":
        // testKey1 can point to either testValue1 or testValueX based on the order
        // of merge, but we see testValueX winning here since segment_2 is merged on segment_1.
        assertEquals("testValueX", fi.getAttribute("testKey1"));
        assertEquals("testValue2", fi.getAttribute("testKey2"));
        break;
      case "f2":
        assertEquals("testValue2", fi.getAttribute("testKey1"));
        break;
      default:
        fail("Unknown field: " + fi.name);
    }
  }
  reader.close();
  writer.close();
  dir.close();
}
public void testMergedFieldInfos_empty() throws IOException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));

View File

@ -21,6 +21,7 @@ import java.io.Reader;
import java.io.StringReader;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
@ -104,6 +105,11 @@ public class TestIndexableField extends LuceneTestCase {
public int pointNumBytes() {
return 0;
}
// This synthetic test field carries no codec attributes; a null map appears
// to be treated as "no attributes" by consumers (other implementations in
// this change also return null).
@Override
public Map<String, String> getAttributes() {
return null;
}
};
public MyField(int counter) {

View File

@ -430,4 +430,9 @@ public final class SchemaField extends FieldProperties implements IndexableField
public int pointNumBytes() {
return 0;
}
/**
 * SchemaField exposes no codec-level attributes, so this returns {@code null}
 * (the indexing chain skips attribute propagation for a null map).
 */
@Override
public Map<String, String> getAttributes() {
return null;
}
}