SOLR-10046: Add UninvertDocValuesMergePolicyFactory class. (Keith Laban, Christine Poerschke)

Christine Poerschke 2017-03-15 10:31:10 +00:00
parent 65c695b025
commit 9d56f13650
5 changed files with 502 additions and 0 deletions

solr/CHANGES.txt

@@ -185,6 +185,8 @@ New Features
* SOLR-10224: Add disk total and disk free metrics. (ab)
* SOLR-10046: Add UninvertDocValuesMergePolicyFactory class. (Keith Laban, Christine Poerschke)
Bug Fixes
----------------------

org/apache/solr/index/UninvertDocValuesMergePolicyFactory.java

@@ -0,0 +1,218 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.index;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FilterCodecReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.OneMergeWrappingMergePolicy;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.uninverting.UninvertingReader;
/**
* A merge policy that detects schema changes and, using an UninvertingReader, writes docvalues
* into merging segments for fields that have docvalues enabled in the schema.
*
* Selection of the segments to merge is delegated to the wrapped merge policy.
*/
public class UninvertDocValuesMergePolicyFactory extends WrapperMergePolicyFactory {
final private boolean skipIntegrityCheck;
/**
* Whether the wrapped docValues producer should skip its integrity (consistency) check.
*/
public boolean getSkipIntegrityCheck() {
return skipIntegrityCheck;
}
public UninvertDocValuesMergePolicyFactory(SolrResourceLoader resourceLoader, MergePolicyFactoryArgs args, IndexSchema schema) {
super(resourceLoader, args, schema);
final Boolean sic = (Boolean)args.remove("skipIntegrityCheck");
if (sic != null) {
this.skipIntegrityCheck = sic.booleanValue();
} else {
this.skipIntegrityCheck = false;
}
if (!args.keys().isEmpty()) {
throw new IllegalArgumentException("Arguments were "+args+" but "+getClass().getSimpleName()+" accepts no other arguments.");
}
}
@Override
protected MergePolicy getMergePolicyInstance(MergePolicy wrappedMP) {
return new OneMergeWrappingMergePolicy(wrappedMP, (merge) -> new UninvertDocValuesOneMerge(merge.segments));
}
private UninvertingReader.Type getUninversionType(FieldInfo fi) {
SchemaField sf = schema.getFieldOrNull(fi.name);
if (null != sf &&
sf.hasDocValues() &&
fi.getDocValuesType() == DocValuesType.NONE &&
fi.getIndexOptions() != IndexOptions.NONE) {
return sf.getType().getUninversionType(sf);
} else {
return null;
}
}
private class UninvertDocValuesOneMerge extends MergePolicy.OneMerge {
public UninvertDocValuesOneMerge(List<SegmentCommitInfo> segments) {
super(segments);
}
@Override
public CodecReader wrapForMerge(CodecReader reader) throws IOException {
// Wrap the reader with an uninverting reader if any field has no docvalues but the
// schema says it should have them
Map<String,UninvertingReader.Type> uninversionMap = null;
for(FieldInfo fi: reader.getFieldInfos()) {
final UninvertingReader.Type type = getUninversionType(fi);
if (type != null) {
if (uninversionMap == null) {
uninversionMap = new HashMap<>();
}
uninversionMap.put(fi.name, type);
}
}
if(uninversionMap == null) {
return reader; // Default to normal reader if nothing to uninvert
} else {
return new UninvertingFilterCodecReader(reader, uninversionMap);
}
}
}
/**
* Delegates to an UninvertingReader for fields that need their docvalues uninverted.
*
* TODO: This is going to blow up the FieldCache; look into an alternative implementation that
* uninverts without the FieldCache.
*/
private class UninvertingFilterCodecReader extends FilterCodecReader {
private final UninvertingReader uninvertingReader;
private final DocValuesProducer docValuesProducer;
public UninvertingFilterCodecReader(CodecReader in, Map<String,UninvertingReader.Type> uninversionMap) {
super(in);
this.uninvertingReader = new UninvertingReader(in, uninversionMap);
this.docValuesProducer = new DocValuesProducer() {
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
return uninvertingReader.getNumericDocValues(field.name);
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
return uninvertingReader.getBinaryDocValues(field.name);
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
return uninvertingReader.getSortedDocValues(field.name);
}
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
return uninvertingReader.getSortedNumericDocValues(field.name);
}
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
return uninvertingReader.getSortedSetDocValues(field.name);
}
@Override
public void checkIntegrity() throws IOException {
if (!skipIntegrityCheck) {
uninvertingReader.checkIntegrity();
}
}
@Override
public void close() throws IOException {
}
@Override
public long ramBytesUsed() {
return 0;
}
};
}
@Override
protected void doClose() throws IOException {
docValuesProducer.close();
uninvertingReader.close();
super.doClose();
}
@Override
public DocValuesProducer getDocValuesReader() {
return docValuesProducer;
}
@Override
public FieldInfos getFieldInfos() {
return uninvertingReader.getFieldInfos();
}
@Override
public CacheHelper getCoreCacheHelper() {
return in.getCoreCacheHelper();
}
@Override
public CacheHelper getReaderCacheHelper() {
return in.getReaderCacheHelper();
}
}
}
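For orientation, here is a minimal sketch of how this factory might be enabled in a solrconfig.xml indexConfig block. It simply mirrors the test configuration added later in this commit; "inner" is just the namespace chosen via wrapped.prefix, and the wrapped class is whichever merge policy factory should keep selecting the segments to merge.

<indexConfig>
  <!-- During merges, uninvert and write docvalues for fields that have docValues
       enabled in the schema but lack them in already-written segments. -->
  <mergePolicyFactory class="org.apache.solr.index.UninvertDocValuesMergePolicyFactory">
    <!-- Namespace under which the wrapped (delegate) factory is configured. -->
    <str name="wrapped.prefix">inner</str>
    <!-- The wrapped policy still decides which segments get merged. -->
    <str name="inner.class">org.apache.solr.index.DefaultMergePolicyFactory</str>
    <!-- Optional; when true, the delegate docvalues producer's integrity check is skipped. -->
    <bool name="skipIntegrityCheck">false</bool>
  </mergePolicyFactory>
</indexConfig>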

schema-docValues.xml

@@ -62,6 +62,7 @@
<field name="datedv" type="date" indexed="false" stored="false" docValues="true" default="1995-12-31T23:59:59.999Z"/> <field name="datedv" type="date" indexed="false" stored="false" docValues="true" default="1995-12-31T23:59:59.999Z"/>
<field name="stringdv" type="string" indexed="false" stored="false" docValues="true" default="solr" /> <field name="stringdv" type="string" indexed="false" stored="false" docValues="true" default="solr" />
<field name="string_add_dv_later" type="string" indexed="true" stored="true" docValues="false"/>
<field name="booldv" type="boolean" indexed="false" stored="false" docValues="true" default="true" /> <field name="booldv" type="boolean" indexed="false" stored="false" docValues="true" default="true" />
<field name="floatdvs" type="float" indexed="false" stored="false" docValues="true" default="1"/> <field name="floatdvs" type="float" indexed="false" stored="false" docValues="true" default="1"/>

solrconfig-uninvertdocvaluesmergepolicyfactory.xml

@@ -0,0 +1,38 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<config>
<luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
<schemaFactory class="ClassicIndexSchemaFactory"/>
<indexConfig>
<useCompoundFile>${useCompoundFile:false}</useCompoundFile>
<mergePolicyFactory class="org.apache.solr.index.UninvertDocValuesMergePolicyFactory">
<str name="wrapped.prefix">inner</str>
<str name="inner.class">org.apache.solr.index.DefaultMergePolicyFactory</str>
<bool name="skipIntegrityCheck">${solr.tests.skipIntegrityCheck:false}</bool>
</mergePolicyFactory>
<mergeScheduler class="org.apache.lucene.index.ConcurrentMergeScheduler"/>
</indexConfig>
<requestHandler name="standard" class="solr.StandardRequestHandler"></requestHandler>
</config>
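The wrapped.prefix / inner.class pair above follows the WrapperMergePolicyFactory convention: wrapped.prefix names a prefix, and the arguments under that prefix configure the wrapped factory. As a hedged illustration only (the "delegate" prefix, TieredMergePolicyFactory, and its pass-through arguments are assumptions, not part of this commit), the same uninverting behavior could wrap a tiered policy like so:

<mergePolicyFactory class="org.apache.solr.index.UninvertDocValuesMergePolicyFactory">
  <str name="wrapped.prefix">delegate</str>
  <str name="delegate.class">org.apache.solr.index.TieredMergePolicyFactory</str>
  <!-- Arguments under the chosen prefix are forwarded to the wrapped factory. -->
  <int name="delegate.maxMergeAtOnce">10</int>
  <int name="delegate.segmentsPerTier">10</int>
</mergePolicyFactory>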

org/apache/solr/index/UninvertDocValuesMergePolicyTest.java

@@ -0,0 +1,243 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.index;
import java.util.Random;
import java.util.function.IntUnaryOperator;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SortedDocValues;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.TestHarness;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
public class UninvertDocValuesMergePolicyTest extends SolrTestCaseJ4 {
private static String SOLR_TESTS_SKIP_INTEGRITY_CHECK = "solr.tests.skipIntegrityCheck";
private static String ID_FIELD = "id";
private static String TEST_FIELD = "string_add_dv_later";
@BeforeClass
public static void beforeTests() throws Exception {
System.setProperty(SOLR_TESTS_SKIP_INTEGRITY_CHECK, (random().nextBoolean() ? "true" : "false"));
}
@AfterClass
public static void afterTests() {
System.clearProperty(SOLR_TESTS_SKIP_INTEGRITY_CHECK);
}
@After
public void after() throws Exception {
deleteCore();
}
@Before
public void before() throws Exception {
initCore("solrconfig-uninvertdocvaluesmergepolicyfactory.xml", "schema-docValues.xml");
}
public void testIndexAndAddDocValues() throws Exception {
Random rand = random();
for(int i=0; i < 100; i++) {
assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));
if(rand.nextBoolean()) {
assertU(commit());
}
}
assertU(commit());
// Assert everything has been indexed and there are no docvalues
withNewRawReader(h, topReader -> {
assertEquals(100, topReader.numDocs());
final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
// The global field type should not have docValues yet
assertEquals(DocValuesType.NONE, infos.fieldInfo(TEST_FIELD).getDocValuesType());
});
addDocValuesTo(h, TEST_FIELD);
// Add some more documents with doc values turned on, including updates to some existing documents
for(int i=90; i < 110; i++) {
assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));
if(rand.nextBoolean()) {
assertU(commit());
}
}
assertU(commit());
withNewRawReader(h, topReader -> {
assertEquals(110, topReader.numDocs());
final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
// The global field type should have docValues because a document with dvs was added
assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());
});
int optimizeSegments = 1;
assertU(optimize("maxSegments", String.valueOf(optimizeSegments)));
// Assert all docs have the right docvalues
withNewRawReader(h, topReader -> {
// Assert merged into one segment
assertEquals(110, topReader.numDocs());
assertEquals(optimizeSegments, topReader.leaves().size());
final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
// The global field type should have docValues because a document with dvs was added
assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());
// Check that all segments have the right docvalues type with the correct value
// Also check that other fields (e.g. the id field) didn't mistakenly get docvalues added
for (LeafReaderContext ctx : topReader.leaves()) {
LeafReader r = ctx.reader();
SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
for(int i = 0; i < r.numDocs(); ++i) {
Document doc = r.document(i);
String v = doc.getField(TEST_FIELD).stringValue();
String id = doc.getField(ID_FIELD).stringValue();
assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());
assertEquals(v, id);
docvalues.nextDoc();
assertEquals(v, docvalues.binaryValue().utf8ToString());
}
}
});
}
// When a non-indexed field gets merged, it exhibits the old behavior:
// the field is merged and its docvalues headers are updated, but no docvalues are written for it
public void testNonIndexedFieldDoesNonFail() throws Exception {
// Remove Indexed from fieldType
removeIndexFrom(h, TEST_FIELD);
assertU(adoc(ID_FIELD, String.valueOf(1), TEST_FIELD, String.valueOf(1)));
assertU(commit());
addDocValuesTo(h, TEST_FIELD);
assertU(adoc(ID_FIELD, String.valueOf(2), TEST_FIELD, String.valueOf(2)));
assertU(commit());
assertU(optimize("maxSegments", "1"));
withNewRawReader(h, topReader -> {
// Assert merged into one segment
assertEquals(2, topReader.numDocs());
assertEquals(1, topReader.leaves().size());
final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
// The global field type should have docValues because a document with dvs was added
assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());
for (LeafReaderContext ctx : topReader.leaves()) {
LeafReader r = ctx.reader();
SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
for(int i = 0; i < r.numDocs(); ++i) {
Document doc = r.document(i);
String v = doc.getField(TEST_FIELD).stringValue();
String id = doc.getField(ID_FIELD).stringValue();
assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());
if(id.equals("2")) {
assertTrue(docvalues.advanceExact(i));
assertEquals(v, docvalues.binaryValue().utf8ToString());
} else {
assertFalse(docvalues.advanceExact(i));
}
}
}
});
}
private static void addDocValuesTo(TestHarness h, String fieldName) {
implUpdateSchemaField(h, fieldName, (p) -> (p | 0x00008000)); // FieldProperties.DOC_VALUES
}
private static void removeIndexFrom(TestHarness h, String fieldName) {
implUpdateSchemaField(h, fieldName, (p) -> (p ^ 0x00000001)); // FieldProperties.INDEXED
}
private static void implUpdateSchemaField(TestHarness h, String fieldName, IntUnaryOperator propertiesModifier) {
try (SolrCore core = h.getCoreInc()) {
// Replace the schema field with a copy whose properties have been modified
IndexSchema schema = core.getLatestSchema();
SchemaField oldSchemaField = schema.getField(fieldName);
SchemaField newSchemaField = new SchemaField(
fieldName,
oldSchemaField.getType(),
propertiesModifier.applyAsInt(oldSchemaField.getProperties()),
oldSchemaField.getDefaultValue());
schema.getFields().put(fieldName, newSchemaField);
}
}
private interface DirectoryReaderConsumer {
public void accept(DirectoryReader reader) throws Exception;
}
private static void withNewRawReader(TestHarness h, DirectoryReaderConsumer consumer) {
try (SolrCore core = h.getCoreInc()) {
final RefCounted<SolrIndexSearcher> searcherRef = core.openNewSearcher(true, true);
final SolrIndexSearcher searcher = searcherRef.get();
try {
try {
consumer.accept(searcher.getRawReader());
} catch (Exception e) {
fail(e.toString());
}
} finally {
searcherRef.decref();
}
}
}
}