mirror of https://github.com/apache/lucene.git
SOLR-10046: Add UninvertDocValuesMergePolicyFactory class. (Keith Laban, Christine Poerschke)
This commit is contained in:
parent
65c695b025
commit
9d56f13650
|
@ -185,6 +185,8 @@ New Features
|
|||
|
||||
* SOLR-10224: Add disk total and disk free metrics. (ab)
|
||||
|
||||
* SOLR-10046: Add UninvertDocValuesMergePolicyFactory class. (Keith Laban, Christine Poerschke)
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -0,0 +1,218 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.index;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.index.CodecReader;
|
||||
import org.apache.lucene.index.DocValuesType;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.FilterCodecReader;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.MergePolicy;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.OneMergeWrappingMergePolicy;
|
||||
import org.apache.lucene.index.SegmentCommitInfo;
|
||||
import org.apache.lucene.index.SortedDocValues;
|
||||
import org.apache.lucene.index.SortedNumericDocValues;
|
||||
import org.apache.lucene.index.SortedSetDocValues;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.uninverting.UninvertingReader;
|
||||
|
||||
/**
|
||||
* A merge policy that can detect schema changes and write docvalues into merging segments when a field has docvalues enabled
|
||||
* Using UninvertingReader.
|
||||
*
|
||||
* This merge policy will delegate to the wrapped merge policy for selecting merge segments
|
||||
*
|
||||
*/
|
||||
public class UninvertDocValuesMergePolicyFactory extends WrapperMergePolicyFactory {
|
||||
|
||||
final private boolean skipIntegrityCheck;
|
||||
|
||||
/**
|
||||
* Whether or not the wrapped docValues producer should check consistency
|
||||
*/
|
||||
public boolean getSkipIntegrityCheck() {
|
||||
return skipIntegrityCheck;
|
||||
}
|
||||
|
||||
public UninvertDocValuesMergePolicyFactory(SolrResourceLoader resourceLoader, MergePolicyFactoryArgs args, IndexSchema schema) {
|
||||
super(resourceLoader, args, schema);
|
||||
final Boolean sic = (Boolean)args.remove("skipIntegrityCheck");
|
||||
if (sic != null) {
|
||||
this.skipIntegrityCheck = sic.booleanValue();
|
||||
} else {
|
||||
this.skipIntegrityCheck = false;
|
||||
}
|
||||
if (!args.keys().isEmpty()) {
|
||||
throw new IllegalArgumentException("Arguments were "+args+" but "+getClass().getSimpleName()+" takes no arguments.");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected MergePolicy getMergePolicyInstance(MergePolicy wrappedMP) {
|
||||
return new OneMergeWrappingMergePolicy(wrappedMP, (merge) -> new UninvertDocValuesOneMerge(merge.segments));
|
||||
}
|
||||
|
||||
private UninvertingReader.Type getUninversionType(FieldInfo fi) {
|
||||
SchemaField sf = schema.getFieldOrNull(fi.name);
|
||||
|
||||
if (null != sf &&
|
||||
sf.hasDocValues() &&
|
||||
fi.getDocValuesType() == DocValuesType.NONE &&
|
||||
fi.getIndexOptions() != IndexOptions.NONE) {
|
||||
return sf.getType().getUninversionType(sf);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private class UninvertDocValuesOneMerge extends MergePolicy.OneMerge {
|
||||
|
||||
public UninvertDocValuesOneMerge(List<SegmentCommitInfo> segments) {
|
||||
super(segments);
|
||||
}
|
||||
|
||||
@Override
|
||||
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
|
||||
// Wrap the reader with an uninverting reader if any of the fields have no docvalues but the
|
||||
// Schema says there should be
|
||||
|
||||
|
||||
Map<String,UninvertingReader.Type> uninversionMap = null;
|
||||
|
||||
for(FieldInfo fi: reader.getFieldInfos()) {
|
||||
final UninvertingReader.Type type = getUninversionType(fi);
|
||||
if (type != null) {
|
||||
if (uninversionMap == null) {
|
||||
uninversionMap = new HashMap<>();
|
||||
}
|
||||
uninversionMap.put(fi.name, type);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if(uninversionMap == null) {
|
||||
return reader; // Default to normal reader if nothing to uninvert
|
||||
} else {
|
||||
return new UninvertingFilterCodecReader(reader, uninversionMap);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Delegates to an Uninverting for fields with docvalues
|
||||
*
|
||||
* This is going to blow up FieldCache, look into an alternative implementation that uninverts without
|
||||
* fieldcache
|
||||
*/
|
||||
private class UninvertingFilterCodecReader extends FilterCodecReader {
|
||||
|
||||
private final UninvertingReader uninvertingReader;
|
||||
private final DocValuesProducer docValuesProducer;
|
||||
|
||||
public UninvertingFilterCodecReader(CodecReader in, Map<String,UninvertingReader.Type> uninversionMap) {
|
||||
super(in);
|
||||
|
||||
this.uninvertingReader = new UninvertingReader(in, uninversionMap);
|
||||
this.docValuesProducer = new DocValuesProducer() {
|
||||
|
||||
@Override
|
||||
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
|
||||
return uninvertingReader.getNumericDocValues(field.name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
|
||||
return uninvertingReader.getBinaryDocValues(field.name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedDocValues getSorted(FieldInfo field) throws IOException {
|
||||
return uninvertingReader.getSortedDocValues(field.name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
|
||||
return uninvertingReader.getSortedNumericDocValues(field.name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
|
||||
return uninvertingReader.getSortedSetDocValues(field.name);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void checkIntegrity() throws IOException {
|
||||
if (!skipIntegrityCheck) {
|
||||
uninvertingReader.checkIntegrity();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ramBytesUsed() {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doClose() throws IOException {
|
||||
docValuesProducer.close();
|
||||
uninvertingReader.close();
|
||||
super.doClose();
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesProducer getDocValuesReader() {
|
||||
return docValuesProducer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldInfos getFieldInfos() {
|
||||
return uninvertingReader.getFieldInfos();
|
||||
}
|
||||
|
||||
@Override
|
||||
public CacheHelper getCoreCacheHelper() {
|
||||
return in.getCoreCacheHelper();
|
||||
}
|
||||
|
||||
@Override
|
||||
public CacheHelper getReaderCacheHelper() {
|
||||
return in.getReaderCacheHelper();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -62,6 +62,7 @@
|
|||
<field name="datedv" type="date" indexed="false" stored="false" docValues="true" default="1995-12-31T23:59:59.999Z"/>
|
||||
|
||||
<field name="stringdv" type="string" indexed="false" stored="false" docValues="true" default="solr" />
|
||||
<field name="string_add_dv_later" type="string" indexed="true" stored="true" docValues="false"/>
|
||||
<field name="booldv" type="boolean" indexed="false" stored="false" docValues="true" default="true" />
|
||||
|
||||
<field name="floatdvs" type="float" indexed="false" stored="false" docValues="true" default="1"/>
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
<?xml version="1.0" ?>

<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<!-- Minimal test config: wraps the default merge policy in
     UninvertDocValuesMergePolicyFactory so merges write docvalues for fields
     that gained docValues in the schema after indexing. The integrity check
     can be skipped via the solr.tests.skipIntegrityCheck system property. -->
<config>
  <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
  <schemaFactory class="ClassicIndexSchemaFactory"/>

  <indexConfig>
    <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
    <mergePolicyFactory class="org.apache.solr.index.UninvertDocValuesMergePolicyFactory">
      <str name="wrapped.prefix">inner</str>
      <str name="inner.class">org.apache.solr.index.DefaultMergePolicyFactory</str>
      <bool name="skipIntegrityCheck">${solr.tests.skipIntegrityCheck:false}</bool>
    </mergePolicyFactory>

    <mergeScheduler class="org.apache.lucene.index.ConcurrentMergeScheduler"/>
  </indexConfig>

  <requestHandler name="standard" class="solr.StandardRequestHandler"></requestHandler>

</config>
|
|
@ -0,0 +1,243 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.index;
|
||||
|
||||
import java.util.Random;
|
||||
import java.util.function.IntUnaryOperator;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.DocValuesType;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.SortedDocValues;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
import org.apache.solr.util.RefCounted;
|
||||
import org.apache.solr.util.TestHarness;
|
||||
import org.junit.After;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
public class UninvertDocValuesMergePolicyTest extends SolrTestCaseJ4 {
|
||||
|
||||
private static String SOLR_TESTS_SKIP_INTEGRITY_CHECK = "solr.tests.skipIntegrityCheck";
|
||||
private static String ID_FIELD = "id";
|
||||
private static String TEST_FIELD = "string_add_dv_later";
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeTests() throws Exception {
|
||||
System.setProperty(SOLR_TESTS_SKIP_INTEGRITY_CHECK, (random().nextBoolean() ? "true" : "false"));
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void afterTests() {
|
||||
System.clearProperty(SOLR_TESTS_SKIP_INTEGRITY_CHECK);
|
||||
}
|
||||
|
||||
@After
|
||||
public void after() throws Exception {
|
||||
deleteCore();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void before() throws Exception {
|
||||
initCore("solrconfig-uninvertdocvaluesmergepolicyfactory.xml", "schema-docValues.xml");
|
||||
}
|
||||
|
||||
public void testIndexAndAddDocValues() throws Exception {
|
||||
Random rand = random();
|
||||
|
||||
for(int i=0; i < 100; i++) {
|
||||
assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));
|
||||
|
||||
if(rand.nextBoolean()) {
|
||||
assertU(commit());
|
||||
}
|
||||
}
|
||||
|
||||
assertU(commit());
|
||||
|
||||
// Assert everything has been indexed and there are no docvalues
|
||||
withNewRawReader(h, topReader -> {
|
||||
assertEquals(100, topReader.numDocs());
|
||||
|
||||
final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
|
||||
|
||||
// The global field type should not have docValues yet
|
||||
assertEquals(DocValuesType.NONE, infos.fieldInfo(TEST_FIELD).getDocValuesType());
|
||||
});
|
||||
|
||||
|
||||
addDocValuesTo(h, TEST_FIELD);
|
||||
|
||||
|
||||
// Add some more documents with doc values turned on including updating some
|
||||
for(int i=90; i < 110; i++) {
|
||||
assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));
|
||||
|
||||
if(rand.nextBoolean()) {
|
||||
assertU(commit());
|
||||
}
|
||||
}
|
||||
|
||||
assertU(commit());
|
||||
|
||||
withNewRawReader(h, topReader -> {
|
||||
assertEquals(110, topReader.numDocs());
|
||||
|
||||
final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
|
||||
// The global field type should have docValues because a document with dvs was added
|
||||
assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());
|
||||
});
|
||||
|
||||
int optimizeSegments = 1;
|
||||
assertU(optimize("maxSegments", String.valueOf(optimizeSegments)));
|
||||
|
||||
|
||||
// Assert all docs have the right docvalues
|
||||
withNewRawReader(h, topReader -> {
|
||||
// Assert merged into one segment
|
||||
assertEquals(110, topReader.numDocs());
|
||||
assertEquals(optimizeSegments, topReader.leaves().size());
|
||||
|
||||
|
||||
final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
|
||||
// The global field type should have docValues because a document with dvs was added
|
||||
assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());
|
||||
|
||||
|
||||
// Check that all segments have the right docvalues type with the correct value
|
||||
// Also check that other fields (e.g. the id field) didn't mistakenly get docvalues added
|
||||
for (LeafReaderContext ctx : topReader.leaves()) {
|
||||
LeafReader r = ctx.reader();
|
||||
SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
|
||||
for(int i = 0; i < r.numDocs(); ++i) {
|
||||
Document doc = r.document(i);
|
||||
String v = doc.getField(TEST_FIELD).stringValue();
|
||||
String id = doc.getField(ID_FIELD).stringValue();
|
||||
assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
|
||||
assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());
|
||||
assertEquals(v, id);
|
||||
|
||||
docvalues.nextDoc();
|
||||
assertEquals(v, docvalues.binaryValue().utf8ToString());
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// When an non-indexed field gets merged, it exhibit the old behavior
|
||||
// The field will be merged, docvalues headers updated, but no docvalues for this field
|
||||
public void testNonIndexedFieldDoesNonFail() throws Exception {
|
||||
// Remove Indexed from fieldType
|
||||
removeIndexFrom(h, TEST_FIELD);
|
||||
|
||||
assertU(adoc(ID_FIELD, String.valueOf(1), TEST_FIELD, String.valueOf(1)));
|
||||
assertU(commit());
|
||||
|
||||
addDocValuesTo(h, TEST_FIELD);
|
||||
|
||||
assertU(adoc(ID_FIELD, String.valueOf(2), TEST_FIELD, String.valueOf(2)));
|
||||
assertU(commit());
|
||||
|
||||
assertU(optimize("maxSegments", "1"));
|
||||
|
||||
withNewRawReader(h, topReader -> {
|
||||
// Assert merged into one segment
|
||||
assertEquals(2, topReader.numDocs());
|
||||
assertEquals(1, topReader.leaves().size());
|
||||
|
||||
|
||||
final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
|
||||
// The global field type should have docValues because a document with dvs was added
|
||||
assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());
|
||||
|
||||
for (LeafReaderContext ctx : topReader.leaves()) {
|
||||
LeafReader r = ctx.reader();
|
||||
SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
|
||||
for(int i = 0; i < r.numDocs(); ++i) {
|
||||
Document doc = r.document(i);
|
||||
String v = doc.getField(TEST_FIELD).stringValue();
|
||||
String id = doc.getField(ID_FIELD).stringValue();
|
||||
assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
|
||||
assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());
|
||||
|
||||
|
||||
if(id.equals("2")) {
|
||||
assertTrue(docvalues.advanceExact(i));
|
||||
assertEquals(v, docvalues.binaryValue().utf8ToString());
|
||||
} else {
|
||||
assertFalse(docvalues.advanceExact(i));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private static void addDocValuesTo(TestHarness h, String fieldName) {
|
||||
implUpdateSchemaField(h, fieldName, (p) -> (p | 0x00008000)); // FieldProperties.DOC_VALUES
|
||||
}
|
||||
|
||||
private static void removeIndexFrom(TestHarness h, String fieldName) {
|
||||
implUpdateSchemaField(h, fieldName, (p) -> (p ^ 0x00000001)); // FieldProperties.INDEXED
|
||||
}
|
||||
|
||||
private static void implUpdateSchemaField(TestHarness h, String fieldName, IntUnaryOperator propertiesModifier) {
|
||||
try (SolrCore core = h.getCoreInc()) {
|
||||
|
||||
// Add docvalues to the field type
|
||||
IndexSchema schema = core.getLatestSchema();
|
||||
SchemaField oldSchemaField = schema.getField(fieldName);
|
||||
SchemaField newSchemaField = new SchemaField(
|
||||
fieldName,
|
||||
oldSchemaField.getType(),
|
||||
propertiesModifier.applyAsInt(oldSchemaField.getProperties()),
|
||||
oldSchemaField.getDefaultValue());
|
||||
schema.getFields().put(fieldName, newSchemaField);
|
||||
}
|
||||
}
|
||||
|
||||
private interface DirectoryReaderConsumer {
|
||||
public void accept(DirectoryReader consumer) throws Exception;
|
||||
}
|
||||
|
||||
private static void withNewRawReader(TestHarness h, DirectoryReaderConsumer consumer) {
|
||||
try (SolrCore core = h.getCoreInc()) {
|
||||
final RefCounted<SolrIndexSearcher> searcherRef = core.openNewSearcher(true, true);
|
||||
final SolrIndexSearcher searcher = searcherRef.get();
|
||||
try {
|
||||
try {
|
||||
consumer.accept(searcher.getRawReader());
|
||||
} catch (Exception e) {
|
||||
fail(e.toString());
|
||||
}
|
||||
} finally {
|
||||
searcherRef.decref();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue