LUCENE-5801: add back OrdinalMappingAtomicReader

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1607582 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2014-07-03 10:49:39 +00:00
parent 3a7ae7ee18
commit a7eee727cf
4 changed files with 341 additions and 0 deletions

lucene/CHANGES.txt

@@ -101,6 +101,10 @@ New Features

* LUCENE-5778: Support hunspell morphological description fields/aliases.
  (Robert Muir)

* LUCENE-5801: Added (back) OrdinalMappingAtomicReader for merging search
  indexes that contain category ordinals from separate taxonomy indexes.
  (Nicola Buso via Shai Erera)

API Changes

* LUCENE-5752: Simplified Automaton API to be immutable. (Mike McCandless)

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/OrdinalMappingAtomicReader.java

@@ -0,0 +1,144 @@
package org.apache.lucene.facet.taxonomy;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.OrdinalsReader.OrdinalsSegmentReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FilterAtomicReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
/**
 * A {@link FilterAtomicReader} for updating facet ordinal references,
 * based on an ordinal map. Use this reader in conjunction with merging
 * taxonomies: after you merge two taxonomies you receive an {@link OrdinalMap}
 * which maps the 'old' ordinals to the 'new' ones. You can use that map to
 * re-map the doc values which contain the facet information (ordinals),
 * either before or while merging the indexes.
 * <p>
 * For example, to re-map the ordinals during an index merge:
*
* <pre class="prettyprint">
* // merge the old taxonomy with the new one.
* OrdinalMap map = new MemoryOrdinalMap();
* DirectoryTaxonomyWriter.addTaxonomy(srcTaxoDir, map);
* int[] ordmap = map.getMap();
*
 * // Add the index and re-map ordinals on the fly
* DirectoryReader reader = DirectoryReader.open(oldDir);
* IndexWriterConfig conf = new IndexWriterConfig(VER, ANALYZER);
* IndexWriter writer = new IndexWriter(newDir, conf);
 * List&lt;AtomicReaderContext&gt; leaves = reader.leaves();
 * AtomicReader[] wrappedLeaves = new AtomicReader[leaves.size()];
 * for (int i = 0; i &lt; leaves.size(); i++) {
 *   wrappedLeaves[i] = new OrdinalMappingAtomicReader(leaves.get(i).reader(), ordmap);
* }
* writer.addIndexes(new MultiReader(wrappedLeaves));
* writer.commit();
* </pre>
*
* @lucene.experimental
*/
public class OrdinalMappingAtomicReader extends FilterAtomicReader {
// silly way, but we need to use dedupAndEncode and it's protected on FacetsConfig.
private static class InnerFacetsConfig extends FacetsConfig {
InnerFacetsConfig() {}
@Override
public BytesRef dedupAndEncode(IntsRef ordinals) {
return super.dedupAndEncode(ordinals);
}
}
private class OrdinalMappingBinaryDocValues extends BinaryDocValues {
private final IntsRef ordinals = new IntsRef(32);
private final OrdinalsSegmentReader ordsReader;
OrdinalMappingBinaryDocValues(OrdinalsSegmentReader ordsReader) throws IOException {
this.ordsReader = ordsReader;
}
@SuppressWarnings("synthetic-access")
@Override
public BytesRef get(int docID) {
try {
        // NOTE: this isn't quite kosher, because in general
// multiple threads can call BinaryDV.get which would
// then conflict on the single ordinals instance, but
// because this impl is only used for merging, we know
// only 1 thread calls us:
ordsReader.get(docID, ordinals);
// map the ordinals
for (int i = 0; i < ordinals.length; i++) {
ordinals.ints[i] = ordinalMap[ordinals.ints[i]];
}
return encode(ordinals);
} catch (IOException e) {
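        // BinaryDocValues.get is not declared to throw IOException, so wrap and rethrow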
throw new RuntimeException("error reading category ordinals for doc " + docID, e);
}
}
}
private final int[] ordinalMap;
private final InnerFacetsConfig facetsConfig;
  /**
   * Wraps an AtomicReader, mapping ordinals according to the given ordinalMap.
   */
public OrdinalMappingAtomicReader(AtomicReader in, int[] ordinalMap) {
super(in);
this.ordinalMap = ordinalMap;
facetsConfig = new InnerFacetsConfig();
}
  /**
   * Expert: encodes category ordinals into a BytesRef. Override if you use a
   * custom encoding, other than the default encoding done by {@link FacetsConfig}.
   */
protected BytesRef encode(IntsRef ordinals) {
return facetsConfig.dedupAndEncode(ordinals);
}
  /**
   * Expert: override if you used a custom encoding for the categories
   * under this field.
   */
protected OrdinalsReader getOrdinalsReader(String field) {
return new DocValuesOrdinalsReader(field);
}
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
final OrdinalsReader ordsReader = getOrdinalsReader(field);
return new OrdinalMappingBinaryDocValues(ordsReader.getReader(in.getContext()));
}
}
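
The two expert hooks above are intended to be overridden together when an application stores category ordinals with its own encoding instead of the FacetsConfig default. A minimal subclass sketch; MyOrdinalsReader and MyOrdinalsEncoder are hypothetical application classes (an OrdinalsReader for the custom format and its matching encoder), not Lucene APIs:

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;

public class CustomEncodingOrdinalMappingReader extends OrdinalMappingAtomicReader {

  public CustomEncodingOrdinalMappingReader(AtomicReader in, int[] ordinalMap) {
    super(in, ordinalMap);
  }

  @Override
  protected OrdinalsReader getOrdinalsReader(String field) {
    // read ordinals that were written with the custom format (hypothetical class)
    return new MyOrdinalsReader(field);
  }

  @Override
  protected BytesRef encode(IntsRef ordinals) {
    // re-encode the re-mapped ordinals with the same custom format (hypothetical class)
    return MyOrdinalsEncoder.encode(ordinals);
  }
}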

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyMergeUtils.java

@@ -0,0 +1,66 @@
package org.apache.lucene.facet.taxonomy;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.List;

import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.store.Directory;
/**
* Utility methods for merging index and taxonomy directories.
* @lucene.experimental
*/
public class TaxonomyMergeUtils {
/**
* Merges the given taxonomy and index directories and commits the changes to
* the given writers.
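   * <p>
   * Note: this method does not close the given writers; closing them is the
   * caller's responsibility.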
*/
public static void merge(Directory srcIndexDir, Directory srcTaxDir, OrdinalMap map, IndexWriter destIndexWriter,
DirectoryTaxonomyWriter destTaxWriter) throws IOException {
// merge the taxonomies
destTaxWriter.addTaxonomy(srcTaxDir, map);
    int[] ordinalMap = map.getMap();
DirectoryReader reader = DirectoryReader.open(srcIndexDir);
List<AtomicReaderContext> leaves = reader.leaves();
int numReaders = leaves.size();
    AtomicReader[] wrappedLeaves = new AtomicReader[numReaders];
for (int i = 0; i < numReaders; i++) {
wrappedLeaves[i] = new OrdinalMappingAtomicReader(leaves.get(i).reader(), ordinalMap);
}
try {
destIndexWriter.addIndexes(new MultiReader(wrappedLeaves));
// commit changes to taxonomy and index respectively.
destTaxWriter.commit();
destIndexWriter.commit();
} finally {
reader.close();
}
}
}
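
For reference, a typical call site creates and later closes the destination writers itself, since merge only commits them. A sketch, assuming pre-existing srcIndexDir, srcTaxDir, destIndexDir and destTaxDir Directory instances and an application analyzer (Version.LUCENE_4_9 stands in for whatever version constant matches your index; the 4.x IndexWriterConfig constructor takes a Version and an Analyzer):

import org.apache.lucene.facet.taxonomy.TaxonomyMergeUtils;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

IndexWriter destIndexWriter =
    new IndexWriter(destIndexDir, new IndexWriterConfig(Version.LUCENE_4_9, analyzer));
DirectoryTaxonomyWriter destTaxWriter = new DirectoryTaxonomyWriter(destTaxDir);
try {
  // merges the taxonomies, re-maps the ordinals and commits both writers
  TaxonomyMergeUtils.merge(srcIndexDir, srcTaxDir,
      new DirectoryTaxonomyWriter.MemoryOrdinalMap(),
      destIndexWriter, destTaxWriter);
} finally {
  IOUtils.close(destIndexWriter, destTaxWriter);
}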

lucene/facet/src/test/org/apache/lucene/facet/taxonomy/utils/OrdinalMappingReaderTest.java

@@ -0,0 +1,127 @@
package org.apache.lucene.facet.taxonomy.utils;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetField;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyMergeUtils;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;

import org.junit.Before;
import org.junit.Test;
public class OrdinalMappingReaderTest extends FacetTestCase {
private static final int NUM_DOCS = 100;
private FacetsConfig facetConfig = new FacetsConfig();
@Before
@Override
public void setUp() throws Exception {
super.setUp();
facetConfig.setMultiValued("tag", true);
}
@Test
public void testTaxonomyMergeUtils() throws Exception {
Directory dir = newDirectory();
Directory taxDir = newDirectory();
buildIndexWithFacets(dir, taxDir, true);
Directory dir1 = newDirectory();
Directory taxDir1 = newDirectory();
buildIndexWithFacets(dir1, taxDir1, false);
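    // null analyzer is fine here: the destination writer only receives
    // addIndexes() calls, which don't analyze text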
IndexWriter destIndexWriter = new IndexWriter(dir1, new IndexWriterConfig(TEST_VERSION_CURRENT, null));
DirectoryTaxonomyWriter destTaxWriter = new DirectoryTaxonomyWriter(taxDir1);
try {
TaxonomyMergeUtils.merge(dir, taxDir, new MemoryOrdinalMap(), destIndexWriter, destTaxWriter);
} finally {
IOUtils.close(destIndexWriter, destTaxWriter);
}
verifyResults(dir1, taxDir1);
dir1.close();
taxDir1.close();
dir.close();
taxDir.close();
}
private void verifyResults(Directory dir, Directory taxDir) throws IOException {
DirectoryReader reader1 = DirectoryReader.open(dir);
DirectoryTaxonomyReader taxReader = new DirectoryTaxonomyReader(taxDir);
IndexSearcher searcher = newSearcher(reader1);
FacetsCollector collector = new FacetsCollector();
FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, collector);
Facets facets = new FastTaxonomyFacetCounts(taxReader, facetConfig, collector);
FacetResult result = facets.getTopChildren(10, "tag");
    for (LabelAndValue lv : result.labelValues) {
int weight = lv.value.intValue();
String label = lv.label;
if (VERBOSE) {
System.out.println(label + ": " + weight);
}
      assertEquals(NUM_DOCS, weight);
}
reader1.close();
taxReader.close();
}
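  /**
   * Builds a source index whose facet values are added in ascending or
   * descending order. Using opposite orders for the two source indexes makes
   * the two taxonomies assign different ordinals to the same labels, so the
   * merge must actually re-map them; each label's total count across both
   * indexes is NUM_DOCS, which verifyResults checks after the merge.
   */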
private void buildIndexWithFacets(Directory dir, Directory taxDir, boolean asc) throws IOException {
IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
DirectoryTaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxDir);
for (int i = 1; i <= NUM_DOCS; i++) {
Document doc = new Document();
for (int j = i; j <= NUM_DOCS; j++) {
        int facetValue = asc ? j : NUM_DOCS - j;
doc.add(new FacetField("tag", Integer.toString(facetValue)));
}
writer.addDocument(facetConfig.build(taxonomyWriter, doc));
}
taxonomyWriter.commit();
taxonomyWriter.close();
writer.commit();
writer.close();
}
}