mirror of https://github.com/apache/lucene.git
LUCENE-8601: attributes added to IndexableFieldType during indexing will now be preserved in the index and accessible at search time via FieldInfo attributes
commit 63dfba4c7d
parent ec43d100d1

@@ -260,6 +260,10 @@ New Features
   IndexWriterConfig#setIndexCreatedVersionMajor. This is an expert feature.
   (Adrien Grand)
 
+* LUCENE-8601: Attributes set in the IndexableFieldType for each field during indexing will
+  now be recorded into the corresponding FieldInfo's attributes, accessible at search
+  time (Murali Krishna P)
+
 Improvements
 
 * LUCENE-8463: TopFieldCollector can now early-terminates queries when sorting by SortField.DOC.

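To see the feature end to end, here is a minimal usage sketch in the spirit of the TestFieldInfos#testFieldAttributes test added by this commit; the index path, field name, attribute key/value, and class name are placeholders, and the no-arg IndexWriterConfig is used only for brevity. An attribute put on the FieldType at index time is copied onto the field's FieldInfo by the indexing chain and can be read back from the merged FieldInfos of any reader over that index.

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class FieldAttributeExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/tmp/attr-demo"));
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {

      // Index time: attach a key/value attribute to the field's type.
      FieldType type = new FieldType();
      type.setStored(true);
      type.putAttribute("myKey", "myValue");   // new in LUCENE-8601

      Document doc = new Document();
      doc.add(new Field("myField", "some value", type));
      writer.addDocument(doc);
      writer.commit();

      // Search time: the attribute has been recorded on the field's FieldInfo.
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        FieldInfos fis = FieldInfos.getMergedFieldInfos(reader);
        FieldInfo fi = fis.fieldInfo("myField");
        System.out.println(fi.getAttribute("myKey"));   // expected to print "myValue"
      }
    }
  }
}
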
@@ -135,7 +135,8 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
 
       // Group each consumer by the fields it handles
       for (FieldInfo fi : mergeState.mergeFieldInfos) {
-        DocValuesConsumer consumer = getInstance(fi);
+        // merge should ignore current format for the fields being merged
+        DocValuesConsumer consumer = getInstance(fi, true);
         Collection<String> fieldsForConsumer = consumersToField.get(consumer);
         if (fieldsForConsumer == null) {
           fieldsForConsumer = new ArrayList<>();

@@ -156,9 +157,23 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
   }
 
   private DocValuesConsumer getInstance(FieldInfo field) throws IOException {
+    return getInstance(field, false);
+  }
+
+  /**
+   * DocValuesConsumer for the given field.
+   * @param field - FieldInfo object.
+   * @param ignoreCurrentFormat - ignore the existing format attributes.
+   * @return DocValuesConsumer for the field.
+   * @throws IOException if there is a low-level IO error
+   */
+  private DocValuesConsumer getInstance(FieldInfo field, boolean ignoreCurrentFormat) throws IOException {
     DocValuesFormat format = null;
     if (field.getDocValuesGen() != -1) {
-      final String formatName = field.getAttribute(PER_FIELD_FORMAT_KEY);
+      String formatName = null;
+      if (ignoreCurrentFormat == false) {
+        formatName = field.getAttribute(PER_FIELD_FORMAT_KEY);
+      }
       // this means the field never existed in that segment, yet is applied updates
       if (formatName != null) {
         format = DocValuesFormat.forName(formatName);

@@ -172,12 +187,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
     }
     final String formatName = format.getName();
 
-    String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
-    if (field.getDocValuesGen() == -1 && previousValue != null) {
-      throw new IllegalStateException("found existing value for " + PER_FIELD_FORMAT_KEY +
-          ", field=" + field.name + ", old=" + previousValue + ", new=" + formatName);
-    }
-
+    field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
     Integer suffix = null;
 
     ConsumerAndSuffix consumer = formats.get(format);

@@ -185,7 +195,10 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
       // First time we are seeing this format; create a new instance
 
       if (field.getDocValuesGen() != -1) {
-        final String suffixAtt = field.getAttribute(PER_FIELD_SUFFIX_KEY);
+        String suffixAtt = null;
+        if (!ignoreCurrentFormat) {
+          suffixAtt = field.getAttribute(PER_FIELD_SUFFIX_KEY);
+        }
         // even when dvGen is != -1, it can still be a new field, that never
         // existed in the segment, and therefore doesn't have the recorded
         // attributes yet.

@@ -217,12 +230,7 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat {
         suffix = consumer.suffix;
       }
 
-      previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
-      if (field.getDocValuesGen() == -1 && previousValue != null) {
-        throw new IllegalStateException("found existing value for " + PER_FIELD_SUFFIX_KEY +
-            ", field=" + field.name + ", old=" + previousValue + ", new=" + suffix);
-      }
-
+      field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
       // TODO: we should only provide the "slice" of FIS
       // that this DVF actually sees ...
       return consumer.consumer;

@@ -188,7 +188,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
     // Assign field -> PostingsFormat
     for(String field : indexedFieldNames) {
       FieldInfo fieldInfo = writeState.fieldInfos.fieldInfo(field);
-
+      // TODO: This should check current format from the field attribute?
       final PostingsFormat format = getPostingsFormatForField(field);
 
       if (format == null) {

@@ -226,17 +226,8 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
 
       group.fields.add(field);
 
-      String previousValue = fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
-      if (previousValue != null) {
-        throw new IllegalStateException("found existing value for " + PER_FIELD_FORMAT_KEY +
-            ", field=" + fieldInfo.name + ", old=" + previousValue + ", new=" + formatName);
-      }
-
-      previousValue = fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix));
-      if (previousValue != null) {
-        throw new IllegalStateException("found existing value for " + PER_FIELD_SUFFIX_KEY +
-            ", field=" + fieldInfo.name + ", old=" + previousValue + ", new=" + group.suffix);
-      }
+      fieldInfo.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
+      fieldInfo.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(group.suffix));
     }
     return formatToGroups;
   }

@@ -17,6 +17,9 @@
 package org.apache.lucene.document;
 
 
+import java.util.HashMap;
+import java.util.Map;
+
 import org.apache.lucene.analysis.Analyzer; // javadocs
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.IndexOptions;

@@ -41,6 +44,7 @@ public class FieldType implements IndexableFieldType {
   private int dataDimensionCount;
   private int indexDimensionCount;
   private int dimensionNumBytes;
+  private Map<String, String> attributes;
 
   /**
    * Create a new mutable FieldType with all of the properties from <code>ref</code>

@@ -58,6 +62,9 @@ public class FieldType implements IndexableFieldType {
     this.dataDimensionCount = ref.pointDataDimensionCount();
     this.indexDimensionCount = ref.pointIndexDimensionCount();
     this.dimensionNumBytes = ref.pointNumBytes();
+    if (ref.getAttributes() != null) {
+      this.attributes = new HashMap<>(ref.getAttributes());
+    }
     // Do not copy frozen!
   }
 

@@ -341,6 +348,30 @@ public class FieldType implements IndexableFieldType {
     return dimensionNumBytes;
   }
 
+  /**
+   * Puts an attribute value.
+   * <p>
+   * This is a key-value mapping for the field that the codec can use
+   * to store additional metadata.
+   * <p>
+   * If a value already exists for the field, it will be replaced with
+   * the new value. This method is not thread-safe, user must not add attributes
+   * while other threads are indexing documents with this field type.
+   *
+   * @lucene.experimental
+   */
+  public String putAttribute(String key, String value) {
+    if (attributes == null) {
+      attributes = new HashMap<>();
+    }
+    return attributes.put(key, value);
+  }
+
+  @Override
+  public Map<String, String> getAttributes() {
+    return attributes;
+  }
+
   /** Prints a Field for human consumption. */
   @Override
   public String toString() {

@@ -661,6 +661,11 @@ final class DefaultIndexingChain extends DocConsumer {
 
       FieldInfo fi = fieldInfos.getOrAdd(name);
       initIndexOptions(fi, fieldType.indexOptions());
+      Map<String, String> attributes = fieldType.getAttributes();
+      if (attributes != null) {
+        attributes.forEach((k, v) -> fi.putAttribute(k, v));
+      }
+
       fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert);
       fp.next = fieldHash[hashPos];
       fieldHash[hashPos] = fp;

@@ -142,7 +142,7 @@ public final class FieldInfo {
 
   // should only be called by FieldInfos#addOrUpdate
   void update(boolean storeTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions,
-              int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes) {
+              Map<String, String> attributes, int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes) {
     if (indexOptions == null) {
       throw new NullPointerException("IndexOptions must not be null (field: \"" + name + "\")");
     }

@@ -176,6 +176,9 @@ public final class FieldInfo {
       // cannot store payloads if we don't store positions:
       this.storePayloads = false;
     }
+    if (attributes != null) {
+      this.attributes.putAll(attributes);
+    }
     assert checkConsistency();
   }
 

@@ -346,8 +349,9 @@ public final class FieldInfo {
    * to store additional metadata, and will be available to the codec
    * when reading the segment via {@link #getAttribute(String)}
    * <p>
-   * If a value already exists for the field, it will be replaced with
-   * the new value.
+   * If a value already exists for the key in the field, it will be replaced with
+   * the new value. If the value of the attributes for a same field is changed between
+   * the documents, the behaviour after merge is undefined.
    */
   public String putAttribute(String key, String value) {
     return attributes.put(key, value);

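To make the caveat in the javadoc above concrete, here is a hypothetical sketch of the undefined-merge case: the same field is indexed in two segments with different values for one attribute key, so the value visible after forceMerge is not guaranteed. The field name "body", the attribute key "compression", the values, the class name, and the index path are all invented for illustration.

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ConflictingAttributeSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/tmp/attr-conflict"));
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig())) {

      FieldType first = new FieldType();
      first.setStored(true);
      first.putAttribute("compression", "lz4");    // hypothetical attribute key/value

      FieldType second = new FieldType(first);     // the copy constructor also copies attributes
      second.putAttribute("compression", "zstd");  // conflicting value for the same field

      Document d1 = new Document();
      d1.add(new Field("body", "first doc", first));
      writer.addDocument(d1);
      writer.commit();                             // segment 1 records "lz4" for "body"

      Document d2 = new Document();
      d2.add(new Field("body", "second doc", second));
      writer.addDocument(d2);
      writer.commit();                             // segment 2 records "zstd" for "body"
      writer.forceMerge(1);                        // which value survives the merge is undefined

      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        String value = FieldInfos.getMergedFieldInfos(reader)
            .fieldInfo("body").getAttribute("compression");
        System.out.println(value);                 // may print either "lz4" or "zstd"
      }
    }
  }
}
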
@@ -506,12 +506,18 @@ public class FieldInfos implements Iterable<FieldInfo> {
                                            boolean storeTermVector,
                                            boolean omitNorms, boolean storePayloads, IndexOptions indexOptions,
                                            DocValuesType docValues, long dvGen,
+                                           Map<String, String> attributes,
                                            int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes,
                                            boolean isSoftDeletesField) {
     assert assertNotFinished();
     if (docValues == null) {
       throw new NullPointerException("DocValuesType must not be null");
     }
+    if (attributes != null) {
+      // original attributes is UnmodifiableMap
+      attributes = new HashMap<>(attributes);
+    }
+
     FieldInfo fi = fieldInfo(name);
     if (fi == null) {
       // This field wasn't yet added to this in-RAM

@@ -520,12 +526,12 @@ public class FieldInfos implements Iterable<FieldInfo> {
       // before then we'll get the same name and number,
       // else we'll allocate a new one:
       final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
-      fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, new HashMap<>(), dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
+      fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
       assert !byName.containsKey(fi.name);
       globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
       byName.put(fi.name, fi);
     } else {
-      fi.update(storeTermVector, omitNorms, storePayloads, indexOptions, dataDimensionCount, indexDimensionCount, dimensionNumBytes);
+      fi.update(storeTermVector, omitNorms, storePayloads, indexOptions, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes);
 
       if (docValues != DocValuesType.NONE) {
         // Only pay the synchronization cost if fi does not already have a DVType

@@ -553,6 +559,7 @@ public class FieldInfos implements Iterable<FieldInfo> {
     return addOrUpdateInternal(fi.name, fi.number, fi.hasVectors(),
                                fi.omitsNorms(), fi.hasPayloads(),
                                fi.getIndexOptions(), fi.getDocValuesType(), dvGen,
+                               fi.attributes(),
                                fi.getPointDataDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
                                fi.isSoftDeletesField());
   }

@@ -17,6 +17,8 @@
 package org.apache.lucene.index;
 
 
+import java.util.Map;
+
 import org.apache.lucene.analysis.Analyzer; // javadocs
 
 /**

@@ -111,4 +113,14 @@ public interface IndexableFieldType {
    * The number of bytes in each dimension's values.
    */
   public int pointNumBytes();
+
+  /**
+   * Attributes for the field type.
+   *
+   * Attributes are not thread-safe, user must not add attributes while other threads are indexing documents
+   * with this field type.
+   *
+   * @return Map
+   */
+  public Map<String, String> getAttributes();
 }

@@ -23,6 +23,7 @@ import java.util.Iterator;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;

@@ -93,6 +94,57 @@ public class TestFieldInfos extends LuceneTestCase {
     dir.close();
   }
 
+  public void testFieldAttributes() throws Exception{
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
+        .setMergePolicy(NoMergePolicy.INSTANCE));
+
+    FieldType type1 = new FieldType();
+    type1.setStored(true);
+    type1.putAttribute("testKey1", "testValue1");
+
+    Document d1 = new Document();
+    d1.add(new Field("f1", "v1", type1));
+    FieldType type2 = new FieldType(type1);
+    //changing the value after copying shouldn't impact the original type1
+    type2.putAttribute("testKey1", "testValue2");
+    writer.addDocument(d1);
+    writer.commit();
+
+    Document d2 = new Document();
+    type1.putAttribute("testKey1", "testValueX");
+    type1.putAttribute("testKey2", "testValue2");
+    d2.add(new Field("f1", "v2", type1));
+    d2.add(new Field("f2", "v2", type2));
+    writer.addDocument(d2);
+    writer.commit();
+    writer.forceMerge(1);
+
+    IndexReader reader = writer.getReader();
+    FieldInfos fis = FieldInfos.getMergedFieldInfos(reader);
+    assertEquals(fis.size(), 2);
+    Iterator<FieldInfo> it = fis.iterator();
+    while(it.hasNext()) {
+      FieldInfo fi = it.next();
+      switch (fi.name) {
+        case "f1":
+          // testKey1 can point to either testValue1 or testValueX based on the order
+          // of merge, but we see textValueX winning here since segment_2 is merged on segment_1.
+          assertEquals("testValueX", fi.getAttribute("testKey1"));
+          assertEquals("testValue2", fi.getAttribute("testKey2"));
+          break;
+        case "f2":
+          assertEquals("testValue2", fi.getAttribute("testKey1"));
+          break;
+        default:
+          assertFalse("Unknown field", true);
+      }
+    }
+    reader.close();
+    writer.close();
+    dir.close();
+  }
+
   public void testMergedFieldInfos_empty() throws IOException {
     Directory dir = newDirectory();
     IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));

@@ -21,6 +21,7 @@ import java.io.Reader;
 import java.io.StringReader;
 import java.util.Collections;
 import java.util.Iterator;
+import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;

@@ -104,6 +105,11 @@ public class TestIndexableField extends LuceneTestCase {
       public int pointNumBytes() {
         return 0;
       }
+
+      @Override
+      public Map<String, String> getAttributes() {
+        return null;
+      }
     };
 
     public MyField(int counter) {

@@ -430,4 +430,9 @@ public final class SchemaField extends FieldProperties implements IndexableField
   public int pointNumBytes() {
     return 0;
   }
+
+  @Override
+  public Map<String, String> getAttributes() {
+    return null;
+  }
 }