diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e8820cb0325..c68af3ecd6f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -260,6 +260,10 @@ New Features IndexWriterConfig#setIndexCreatedVersionMajor. This is an expert feature. (Adrien Grand) +* LUCENE-8601: Attributes set in the IndexableFieldType for each field during indexing will + now be recorded into the corresponding FieldInfo's attributes, accessible at search + time (Murali Krishna P) + Improvements * LUCENE-8463: TopFieldCollector can now early-terminates queries when sorting by SortField.DOC. diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java index eef232cfcd1..f439699e230 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java @@ -135,7 +135,8 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { // Group each consumer by the fields it handles for (FieldInfo fi : mergeState.mergeFieldInfos) { - DocValuesConsumer consumer = getInstance(fi); + // merge should ignore current format for the fields being merged + DocValuesConsumer consumer = getInstance(fi, true); Collection fieldsForConsumer = consumersToField.get(consumer); if (fieldsForConsumer == null) { fieldsForConsumer = new ArrayList<>(); @@ -156,9 +157,23 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { } private DocValuesConsumer getInstance(FieldInfo field) throws IOException { + return getInstance(field, false); + } + + /** + * DocValuesConsumer for the given field. + * @param field - FieldInfo object. + * @param ignoreCurrentFormat - ignore the existing format attributes. + * @return DocValuesConsumer for the field. 
+ * @throws IOException if there is a low-level IO error + */ + private DocValuesConsumer getInstance(FieldInfo field, boolean ignoreCurrentFormat) throws IOException { DocValuesFormat format = null; if (field.getDocValuesGen() != -1) { - final String formatName = field.getAttribute(PER_FIELD_FORMAT_KEY); + String formatName = null; + if (ignoreCurrentFormat == false) { + formatName = field.getAttribute(PER_FIELD_FORMAT_KEY); + } // this means the field never existed in that segment, yet is applied updates if (formatName != null) { format = DocValuesFormat.forName(formatName); @@ -171,21 +186,19 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { throw new IllegalStateException("invalid null DocValuesFormat for field=\"" + field.name + "\""); } final String formatName = format.getName(); - - String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName); - if (field.getDocValuesGen() == -1 && previousValue != null) { - throw new IllegalStateException("found existing value for " + PER_FIELD_FORMAT_KEY + - ", field=" + field.name + ", old=" + previousValue + ", new=" + formatName); - } - + + field.putAttribute(PER_FIELD_FORMAT_KEY, formatName); Integer suffix = null; - + ConsumerAndSuffix consumer = formats.get(format); if (consumer == null) { // First time we are seeing this format; create a new instance if (field.getDocValuesGen() != -1) { - final String suffixAtt = field.getAttribute(PER_FIELD_SUFFIX_KEY); + String suffixAtt = null; + if (!ignoreCurrentFormat) { + suffixAtt = field.getAttribute(PER_FIELD_SUFFIX_KEY); + } // even when dvGen is != -1, it can still be a new field, that never // existed in the segment, and therefore doesn't have the recorded // attributes yet. 
@@ -193,7 +206,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { suffix = Integer.valueOf(suffixAtt); } } - + if (suffix == null) { // bump the suffix suffix = suffixes.get(formatName); @@ -204,7 +217,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { } } suffixes.put(formatName, suffix); - + final String segmentSuffix = getFullSegmentSuffix(segmentWriteState.segmentSuffix, getSuffix(formatName, Integer.toString(suffix))); consumer = new ConsumerAndSuffix(); @@ -216,13 +229,8 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { assert suffixes.containsKey(formatName); suffix = consumer.suffix; } - - previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix)); - if (field.getDocValuesGen() == -1 && previousValue != null) { - throw new IllegalStateException("found existing value for " + PER_FIELD_SUFFIX_KEY + - ", field=" + field.name + ", old=" + previousValue + ", new=" + suffix); - } + field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix)); // TODO: we should only provide the "slice" of FIS // that this DVF actually sees ... return consumer.consumer; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java index 9ac0fe2f5a4..88ae6da0b2b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java @@ -188,14 +188,14 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat { // Assign field -> PostingsFormat for(String field : indexedFieldNames) { FieldInfo fieldInfo = writeState.fieldInfos.fieldInfo(field); - + // TODO: This should check current format from the field attribute? 
final PostingsFormat format = getPostingsFormatForField(field); - + if (format == null) { throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field + "\""); } String formatName = format.getName(); - + FieldsGroup group = formatToGroups.get(format); if (group == null) { // First time we are seeing this format; create a @@ -226,17 +226,8 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat { group.fields.add(field); - String previousValue = fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName); - if (previousValue != null) { - throw new IllegalStateException("found existing value for " + PER_FIELD_FORMAT_KEY + - ", field=" + fieldInfo.name + ", old=" + previousValue + ", new=" + formatName); - } - - previousValue = fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix)); - if (previousValue != null) { - throw new IllegalStateException("found existing value for " + PER_FIELD_SUFFIX_KEY + - ", field=" + fieldInfo.name + ", old=" + previousValue + ", new=" + group.suffix); - } + fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName); + fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix)); } return formatToGroups; } diff --git a/lucene/core/src/java/org/apache/lucene/document/FieldType.java b/lucene/core/src/java/org/apache/lucene/document/FieldType.java index a21572e96ca..b450eee385e 100644 --- a/lucene/core/src/java/org/apache/lucene/document/FieldType.java +++ b/lucene/core/src/java/org/apache/lucene/document/FieldType.java @@ -17,6 +17,9 @@ package org.apache.lucene.document; +import java.util.HashMap; +import java.util.Map; + import org.apache.lucene.analysis.Analyzer; // javadocs import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; @@ -41,6 +44,7 @@ public class FieldType implements IndexableFieldType { private int dataDimensionCount; private int indexDimensionCount; private int dimensionNumBytes; + private Map attributes; /** * Create 
a new mutable FieldType with all of the properties from ref @@ -58,6 +62,9 @@ public class FieldType implements IndexableFieldType { this.dataDimensionCount = ref.pointDataDimensionCount(); this.indexDimensionCount = ref.pointIndexDimensionCount(); this.dimensionNumBytes = ref.pointNumBytes(); + if (ref.getAttributes() != null) { + this.attributes = new HashMap<>(ref.getAttributes()); + } // Do not copy frozen! } @@ -341,6 +348,30 @@ public class FieldType implements IndexableFieldType { return dimensionNumBytes; } + /** + * Puts an attribute value. + *

+ * This is a key-value mapping for the field that the codec can use + * to store additional metadata. + *

+ * If a value already exists for the field, it will be replaced with + * the new value. This method is not thread-safe, user must not add attributes + * while other threads are indexing documents with this field type. + * + * @lucene.experimental + */ + public String putAttribute(String key, String value) { + if (attributes == null) { + attributes = new HashMap<>(); + } + return attributes.put(key, value); + } + + @Override + public Map getAttributes() { + return attributes; + } + /** Prints a Field for human consumption. */ @Override public String toString() { diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java index 4cc981dbd7c..8c4145fce20 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java @@ -661,6 +661,11 @@ final class DefaultIndexingChain extends DocConsumer { FieldInfo fi = fieldInfos.getOrAdd(name); initIndexOptions(fi, fieldType.indexOptions()); + Map attributes = fieldType.getAttributes(); + if (attributes != null) { + attributes.forEach((k, v) -> fi.putAttribute(k, v)); + } + fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert); fp.next = fieldHash[hashPos]; fieldHash[hashPos] = fp; diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index c5d85bc3ea5..534652363f1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -142,7 +142,7 @@ public final class FieldInfo { // should only be called by FieldInfos#addOrUpdate void update(boolean storeTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, - int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes) { + Map attributes, int dataDimensionCount, int 
indexDimensionCount, int dimensionNumBytes) { if (indexOptions == null) { throw new NullPointerException("IndexOptions must not be null (field: \"" + name + "\")"); } @@ -176,6 +176,9 @@ public final class FieldInfo { // cannot store payloads if we don't store positions: this.storePayloads = false; } + if (attributes != null) { + this.attributes.putAll(attributes); + } assert checkConsistency(); } @@ -346,8 +349,9 @@ public final class FieldInfo { * to store additional metadata, and will be available to the codec * when reading the segment via {@link #getAttribute(String)} *

- If a value already exists for the field, it will be replaced with - the new value. + If a value already exists for the key in the field, it will be replaced with + the new value. If the value of an attribute for the same field is changed between + documents, the behaviour after merge is undefined. */ public String putAttribute(String key, String value) { return attributes.put(key, value); diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index 88f092a9d1b..193fbdf5906 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -506,12 +506,18 @@ public class FieldInfos implements Iterable { boolean storeTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, long dvGen, + Map attributes, int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) { assert assertNotFinished(); if (docValues == null) { throw new NullPointerException("DocValuesType must not be null"); } + if (attributes != null) { + // original attributes is UnmodifiableMap + attributes = new HashMap<>(attributes); + } + FieldInfo fi = fieldInfo(name); if (fi == null) { // This field wasn't yet added to this in-RAM @@ -520,12 +526,12 @@ public class FieldInfos implements Iterable { // before then we'll get the same name and number, // else we'll allocate a new one: final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField); - fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField); + fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, 
storePayloads, indexOptions, docValues, dvGen, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType()); byName.put(fi.name, fi); } else { - fi.update(storeTermVector, omitNorms, storePayloads, indexOptions, dataDimensionCount, indexDimensionCount, dimensionNumBytes); + fi.update(storeTermVector, omitNorms, storePayloads, indexOptions, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes); if (docValues != DocValuesType.NONE) { // Only pay the synchronization cost if fi does not already have a DVType @@ -553,6 +559,7 @@ public class FieldInfos implements Iterable { return addOrUpdateInternal(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), dvGen, + fi.attributes(), fi.getPointDataDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java b/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java index b2b2e773ca1..59c5ab5192b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java @@ -17,6 +17,8 @@ package org.apache.lucene.index; +import java.util.Map; + import org.apache.lucene.analysis.Analyzer; // javadocs /** @@ -111,4 +113,14 @@ public interface IndexableFieldType { * The number of bytes in each dimension's values. */ public int pointNumBytes(); + + /** + * Attributes for the field type. + * + * Attributes are not thread-safe, user must not add attributes while other threads are indexing documents + * with this field type. 
+ * + * @return the attributes map, or null if no attributes are set + */ + public Map getAttributes(); } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java index 3fe5fa9d2c3..321fcf7dc68 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java @@ -23,6 +23,7 @@ import java.util.Iterator; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -93,6 +94,57 @@ public class TestFieldInfos extends LuceneTestCase { dir.close(); } + public void testFieldAttributes() throws Exception{ + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())) + .setMergePolicy(NoMergePolicy.INSTANCE)); + + FieldType type1 = new FieldType(); + type1.setStored(true); + type1.putAttribute("testKey1", "testValue1"); + + Document d1 = new Document(); + d1.add(new Field("f1", "v1", type1)); + FieldType type2 = new FieldType(type1); + //changing the value after copying shouldn't impact the original type1 + type2.putAttribute("testKey1", "testValue2"); + writer.addDocument(d1); + writer.commit(); + + Document d2 = new Document(); + type1.putAttribute("testKey1", "testValueX"); + type1.putAttribute("testKey2", "testValue2"); + d2.add(new Field("f1", "v2", type1)); + d2.add(new Field("f2", "v2", type2)); + writer.addDocument(d2); + writer.commit(); + writer.forceMerge(1); + + IndexReader reader = writer.getReader(); + FieldInfos fis = FieldInfos.getMergedFieldInfos(reader); + assertEquals(fis.size(), 2); + Iterator it = fis.iterator(); + while(it.hasNext()) { + FieldInfo fi = it.next(); + switch (fi.name) { + case "f1": + // testKey1 can 
point to either testValue1 or testValueX based on the order + // of merge, but we see testValueX winning here since segment_2 is merged on segment_1. + assertEquals("testValueX", fi.getAttribute("testKey1")); + assertEquals("testValue2", fi.getAttribute("testKey2")); + break; + case "f2": + assertEquals("testValue2", fi.getAttribute("testKey1")); + break; + default: + assertFalse("Unknown field", true); + } + } + reader.close(); + writer.close(); + dir.close(); + } + public void testMergedFieldInfos_empty() throws IOException { Directory dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java index 1091b2485e5..59dbae3cd78 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java @@ -21,6 +21,7 @@ import java.io.Reader; import java.io.StringReader; import java.util.Collections; import java.util.Iterator; +import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; @@ -104,6 +105,11 @@ public class TestIndexableField extends LuceneTestCase { public int pointNumBytes() { return 0; } + + @Override + public Map getAttributes() { + return null; + } }; public MyField(int counter) { diff --git a/solr/core/src/java/org/apache/solr/schema/SchemaField.java b/solr/core/src/java/org/apache/solr/schema/SchemaField.java index 7d9449e6d9a..100a963c1c1 100644 --- a/solr/core/src/java/org/apache/solr/schema/SchemaField.java +++ b/solr/core/src/java/org/apache/solr/schema/SchemaField.java @@ -430,4 +430,9 @@ public final class SchemaField extends FieldProperties implements IndexableField public int pointNumBytes() { return 0; } + + @Override + public Map getAttributes() { + return null; + } }