mirror of https://github.com/apache/lucene.git
LUCENE-8601: attributes added to IndexableFieldType during indexing will now be preserved in the index and accessible at search time via FieldInfo attributes
This commit is contained in:
parent
ec43d100d1
commit
63dfba4c7d
|
@ -260,6 +260,10 @@ New Features
|
|||
IndexWriterConfig#setIndexCreatedVersionMajor. This is an expert feature.
|
||||
(Adrien Grand)
|
||||
|
||||
* LUCENE-8601: Attributes set in the IndexableFieldType for each field during indexing will
|
||||
now be recorded into the corresponding FieldInfo's attributes, accessible at search
|
||||
time (Murali Krishna P)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-8463: TopFieldCollector can now early-terminate queries when sorting by SortField.DOC.
|
||||
|
|
|
@ -135,7 +135,8 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
|
|||
|
||||
// Group each consumer by the fields it handles
|
||||
for (FieldInfo fi : mergeState.mergeFieldInfos) {
|
||||
DocValuesConsumer consumer = getInstance(fi);
|
||||
// merge should ignore current format for the fields being merged
|
||||
DocValuesConsumer consumer = getInstance(fi, true);
|
||||
Collection<String> fieldsForConsumer = consumersToField.get(consumer);
|
||||
if (fieldsForConsumer == null) {
|
||||
fieldsForConsumer = new ArrayList<>();
|
||||
|
@ -156,9 +157,23 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
|
|||
}
|
||||
|
||||
private DocValuesConsumer getInstance(FieldInfo field) throws IOException {
|
||||
return getInstance(field, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* DocValuesConsumer for the given field.
|
||||
* @param field - FieldInfo object.
|
||||
* @param ignoreCurrentFormat - ignore the existing format attributes.
|
||||
* @return DocValuesConsumer for the field.
|
||||
* @throws IOException if there is a low-level IO error
|
||||
*/
|
||||
private DocValuesConsumer getInstance(FieldInfo field, boolean ignoreCurrentFormat) throws IOException {
|
||||
DocValuesFormat format = null;
|
||||
if (field.getDocValuesGen() != -1) {
|
||||
final String formatName = field.getAttribute(PER_FIELD_FORMAT_KEY);
|
||||
String formatName = null;
|
||||
if (ignoreCurrentFormat == false) {
|
||||
formatName = field.getAttribute(PER_FIELD_FORMAT_KEY);
|
||||
}
|
||||
// this means the field never existed in that segment, yet has updates applied to it
|
||||
if (formatName != null) {
|
||||
format = DocValuesFormat.forName(formatName);
|
||||
|
@ -171,21 +186,19 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
|
|||
throw new IllegalStateException("invalid null DocValuesFormat for field=\"" + field.name + "\"");
|
||||
}
|
||||
final String formatName = format.getName();
|
||||
|
||||
String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
|
||||
if (field.getDocValuesGen() == -1 && previousValue != null) {
|
||||
throw new IllegalStateException("found existing value for " + PER_FIELD_FORMAT_KEY +
|
||||
", field=" + field.name + ", old=" + previousValue + ", new=" + formatName);
|
||||
}
|
||||
|
||||
|
||||
field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
|
||||
Integer suffix = null;
|
||||
|
||||
|
||||
ConsumerAndSuffix consumer = formats.get(format);
|
||||
if (consumer == null) {
|
||||
// First time we are seeing this format; create a new instance
|
||||
|
||||
if (field.getDocValuesGen() != -1) {
|
||||
final String suffixAtt = field.getAttribute(PER_FIELD_SUFFIX_KEY);
|
||||
String suffixAtt = null;
|
||||
if (!ignoreCurrentFormat) {
|
||||
suffixAtt = field.getAttribute(PER_FIELD_SUFFIX_KEY);
|
||||
}
|
||||
// even when dvGen is != -1, it can still be a new field, that never
|
||||
// existed in the segment, and therefore doesn't have the recorded
|
||||
// attributes yet.
|
||||
|
@ -193,7 +206,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
|
|||
suffix = Integer.valueOf(suffixAtt);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (suffix == null) {
|
||||
// bump the suffix
|
||||
suffix = suffixes.get(formatName);
|
||||
|
@ -204,7 +217,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
|
|||
}
|
||||
}
|
||||
suffixes.put(formatName, suffix);
|
||||
|
||||
|
||||
final String segmentSuffix = getFullSegmentSuffix(segmentWriteState.segmentSuffix,
|
||||
getSuffix(formatName, Integer.toString(suffix)));
|
||||
consumer = new ConsumerAndSuffix();
|
||||
|
@ -216,13 +229,8 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
|
|||
assert suffixes.containsKey(formatName);
|
||||
suffix = consumer.suffix;
|
||||
}
|
||||
|
||||
previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
|
||||
if (field.getDocValuesGen() == -1 && previousValue != null) {
|
||||
throw new IllegalStateException("found existing value for " + PER_FIELD_SUFFIX_KEY +
|
||||
", field=" + field.name + ", old=" + previousValue + ", new=" + suffix);
|
||||
}
|
||||
|
||||
field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
|
||||
// TODO: we should only provide the "slice" of FIS
|
||||
// that this DVF actually sees ...
|
||||
return consumer.consumer;
|
||||
|
|
|
@ -188,14 +188,14 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
|||
// Assign field -> PostingsFormat
|
||||
for(String field : indexedFieldNames) {
|
||||
FieldInfo fieldInfo = writeState.fieldInfos.fieldInfo(field);
|
||||
|
||||
// TODO: This should check current format from the field attribute?
|
||||
final PostingsFormat format = getPostingsFormatForField(field);
|
||||
|
||||
|
||||
if (format == null) {
|
||||
throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field + "\"");
|
||||
}
|
||||
String formatName = format.getName();
|
||||
|
||||
|
||||
FieldsGroup group = formatToGroups.get(format);
|
||||
if (group == null) {
|
||||
// First time we are seeing this format; create a
|
||||
|
@ -226,17 +226,8 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
|||
|
||||
group.fields.add(field);
|
||||
|
||||
String previousValue = fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
|
||||
if (previousValue != null) {
|
||||
throw new IllegalStateException("found existing value for " + PER_FIELD_FORMAT_KEY +
|
||||
", field=" + fieldInfo.name + ", old=" + previousValue + ", new=" + formatName);
|
||||
}
|
||||
|
||||
previousValue = fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix));
|
||||
if (previousValue != null) {
|
||||
throw new IllegalStateException("found existing value for " + PER_FIELD_SUFFIX_KEY +
|
||||
", field=" + fieldInfo.name + ", old=" + previousValue + ", new=" + group.suffix);
|
||||
}
|
||||
fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
|
||||
fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix));
|
||||
}
|
||||
return formatToGroups;
|
||||
}
|
||||
|
|
|
@ -17,6 +17,9 @@
|
|||
package org.apache.lucene.document;
|
||||
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer; // javadocs
|
||||
import org.apache.lucene.index.DocValuesType;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
|
@ -41,6 +44,7 @@ public class FieldType implements IndexableFieldType {
|
|||
private int dataDimensionCount;
|
||||
private int indexDimensionCount;
|
||||
private int dimensionNumBytes;
|
||||
private Map<String, String> attributes;
|
||||
|
||||
/**
|
||||
* Create a new mutable FieldType with all of the properties from <code>ref</code>
|
||||
|
@ -58,6 +62,9 @@ public class FieldType implements IndexableFieldType {
|
|||
this.dataDimensionCount = ref.pointDataDimensionCount();
|
||||
this.indexDimensionCount = ref.pointIndexDimensionCount();
|
||||
this.dimensionNumBytes = ref.pointNumBytes();
|
||||
if (ref.getAttributes() != null) {
|
||||
this.attributes = new HashMap<>(ref.getAttributes());
|
||||
}
|
||||
// Do not copy frozen!
|
||||
}
|
||||
|
||||
|
@ -341,6 +348,30 @@ public class FieldType implements IndexableFieldType {
|
|||
return dimensionNumBytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Puts an attribute value.
|
||||
* <p>
|
||||
* This is a key-value mapping for the field that the codec can use
|
||||
* to store additional metadata.
|
||||
* <p>
|
||||
* If a value already exists for the field, it will be replaced with
|
||||
* the new value. This method is not thread-safe; users must not add attributes
|
||||
* while other threads are indexing documents with this field type.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public String putAttribute(String key, String value) {
|
||||
if (attributes == null) {
|
||||
attributes = new HashMap<>();
|
||||
}
|
||||
return attributes.put(key, value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> getAttributes() {
|
||||
return attributes;
|
||||
}
|
||||
|
||||
/** Prints a Field for human consumption. */
|
||||
@Override
|
||||
public String toString() {
|
||||
|
|
|
@ -661,6 +661,11 @@ final class DefaultIndexingChain extends DocConsumer {
|
|||
|
||||
FieldInfo fi = fieldInfos.getOrAdd(name);
|
||||
initIndexOptions(fi, fieldType.indexOptions());
|
||||
Map<String, String> attributes = fieldType.getAttributes();
|
||||
if (attributes != null) {
|
||||
attributes.forEach((k, v) -> fi.putAttribute(k, v));
|
||||
}
|
||||
|
||||
fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert);
|
||||
fp.next = fieldHash[hashPos];
|
||||
fieldHash[hashPos] = fp;
|
||||
|
|
|
@ -142,7 +142,7 @@ public final class FieldInfo {
|
|||
|
||||
// should only be called by FieldInfos#addOrUpdate
|
||||
void update(boolean storeTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions,
|
||||
int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes) {
|
||||
Map<String, String> attributes, int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes) {
|
||||
if (indexOptions == null) {
|
||||
throw new NullPointerException("IndexOptions must not be null (field: \"" + name + "\")");
|
||||
}
|
||||
|
@ -176,6 +176,9 @@ public final class FieldInfo {
|
|||
// cannot store payloads if we don't store positions:
|
||||
this.storePayloads = false;
|
||||
}
|
||||
if (attributes != null) {
|
||||
this.attributes.putAll(attributes);
|
||||
}
|
||||
assert checkConsistency();
|
||||
}
|
||||
|
||||
|
@ -346,8 +349,9 @@ public final class FieldInfo {
|
|||
* to store additional metadata, and will be available to the codec
|
||||
* when reading the segment via {@link #getAttribute(String)}
|
||||
* <p>
|
||||
* If a value already exists for the field, it will be replaced with
|
||||
* the new value.
|
||||
* If a value already exists for the key in the field, it will be replaced with
|
||||
* the new value. If the value of an attribute for the same field is changed between
|
||||
* the documents, the behaviour after merge is undefined.
|
||||
*/
|
||||
public String putAttribute(String key, String value) {
|
||||
return attributes.put(key, value);
|
||||
|
|
|
@ -506,12 +506,18 @@ public class FieldInfos implements Iterable<FieldInfo> {
|
|||
boolean storeTermVector,
|
||||
boolean omitNorms, boolean storePayloads, IndexOptions indexOptions,
|
||||
DocValuesType docValues, long dvGen,
|
||||
Map<String, String> attributes,
|
||||
int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes,
|
||||
boolean isSoftDeletesField) {
|
||||
assert assertNotFinished();
|
||||
if (docValues == null) {
|
||||
throw new NullPointerException("DocValuesType must not be null");
|
||||
}
|
||||
if (attributes != null) {
|
||||
// original attributes is UnmodifiableMap
|
||||
attributes = new HashMap<>(attributes);
|
||||
}
|
||||
|
||||
FieldInfo fi = fieldInfo(name);
|
||||
if (fi == null) {
|
||||
// This field wasn't yet added to this in-RAM
|
||||
|
@ -520,12 +526,12 @@ public class FieldInfos implements Iterable<FieldInfo> {
|
|||
// before then we'll get the same name and number,
|
||||
// else we'll allocate a new one:
|
||||
final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
|
||||
fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
|
||||
fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
|
||||
assert !byName.containsKey(fi.name);
|
||||
globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
|
||||
byName.put(fi.name, fi);
|
||||
} else {
|
||||
fi.update(storeTermVector, omitNorms, storePayloads, indexOptions, dataDimensionCount, indexDimensionCount, dimensionNumBytes);
|
||||
fi.update(storeTermVector, omitNorms, storePayloads, indexOptions, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes);
|
||||
|
||||
if (docValues != DocValuesType.NONE) {
|
||||
// Only pay the synchronization cost if fi does not already have a DVType
|
||||
|
@ -553,6 +559,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
|
|||
return addOrUpdateInternal(fi.name, fi.number, fi.hasVectors(),
|
||||
fi.omitsNorms(), fi.hasPayloads(),
|
||||
fi.getIndexOptions(), fi.getDocValuesType(), dvGen,
|
||||
fi.attributes(),
|
||||
fi.getPointDataDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
|
||||
fi.isSoftDeletesField());
|
||||
}
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer; // javadocs
|
||||
|
||||
/**
|
||||
|
@ -111,4 +113,14 @@ public interface IndexableFieldType {
|
|||
* The number of bytes in each dimension's values.
|
||||
*/
|
||||
public int pointNumBytes();
|
||||
|
||||
/**
|
||||
* Attributes for the field type.
|
||||
*
|
||||
* Attributes are not thread-safe; users must not add attributes while other threads are indexing documents
|
||||
* with this field type.
|
||||
*
|
||||
* @return the attributes as a key-value map, or {@code null} if there are none
|
||||
*/
|
||||
public Map<String, String> getAttributes();
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Iterator;
|
|||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
@ -93,6 +94,57 @@ public class TestFieldInfos extends LuceneTestCase {
|
|||
dir.close();
|
||||
}
|
||||
|
||||
public void testFieldAttributes() throws Exception{
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
|
||||
.setMergePolicy(NoMergePolicy.INSTANCE));
|
||||
|
||||
FieldType type1 = new FieldType();
|
||||
type1.setStored(true);
|
||||
type1.putAttribute("testKey1", "testValue1");
|
||||
|
||||
Document d1 = new Document();
|
||||
d1.add(new Field("f1", "v1", type1));
|
||||
FieldType type2 = new FieldType(type1);
|
||||
//changing the value after copying shouldn't impact the original type1
|
||||
type2.putAttribute("testKey1", "testValue2");
|
||||
writer.addDocument(d1);
|
||||
writer.commit();
|
||||
|
||||
Document d2 = new Document();
|
||||
type1.putAttribute("testKey1", "testValueX");
|
||||
type1.putAttribute("testKey2", "testValue2");
|
||||
d2.add(new Field("f1", "v2", type1));
|
||||
d2.add(new Field("f2", "v2", type2));
|
||||
writer.addDocument(d2);
|
||||
writer.commit();
|
||||
writer.forceMerge(1);
|
||||
|
||||
IndexReader reader = writer.getReader();
|
||||
FieldInfos fis = FieldInfos.getMergedFieldInfos(reader);
|
||||
assertEquals(fis.size(), 2);
|
||||
Iterator<FieldInfo> it = fis.iterator();
|
||||
while(it.hasNext()) {
|
||||
FieldInfo fi = it.next();
|
||||
switch (fi.name) {
|
||||
case "f1":
|
||||
// testKey1 can point to either testValue1 or testValueX based on the order
|
||||
// of merge, but we see testValueX winning here since segment_2 is merged on segment_1.
|
||||
assertEquals("testValueX", fi.getAttribute("testKey1"));
|
||||
assertEquals("testValue2", fi.getAttribute("testKey2"));
|
||||
break;
|
||||
case "f2":
|
||||
assertEquals("testValue2", fi.getAttribute("testKey1"));
|
||||
break;
|
||||
default:
|
||||
assertFalse("Unknown field", true);
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
writer.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testMergedFieldInfos_empty() throws IOException {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.Reader;
|
|||
import java.io.StringReader;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -104,6 +105,11 @@ public class TestIndexableField extends LuceneTestCase {
|
|||
public int pointNumBytes() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> getAttributes() {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
public MyField(int counter) {
|
||||
|
|
|
@ -430,4 +430,9 @@ public final class SchemaField extends FieldProperties implements IndexableField
|
|||
public int pointNumBytes() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> getAttributes() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue