LUCENE-7891: use a non-buggy LRU cache in Lucene's taxonomy facets, by default

2017-09-05 10:13:58 -04:00 · 2017-09-05 10:13:58 -04:00 · b4a1a1a87b
parent f4b13e86ff
commit b4a1a1a87b
3 changed files with 65 additions and 8 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -52,6 +52,9 @@ Bug Fixes
  not recommended, lucene-analyzers-icu contains binary data structures
  specific to ICU/Unicode versions it is built against. (Chris Koenig, Robert Muir)

+* LUCENE-7891: Lucene's taxonomy facets now use a non-buggy LRU cache
+  by default.  (Jan-Willem van den Broek via Mike McCandless)
+
 Build

 * SOLR-11181: Switch order of maven artifact publishing procedure: deploy first
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/LruTaxonomyWriterCache.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/LruTaxonomyWriterCache.java
@ -32,8 +32,12 @@ public class LruTaxonomyWriterCache implements TaxonomyWriterCache {
   * function, LRU_STRING should be used.
   */
  public enum LRUType {
-    /** Use the label's hash as the key; this can lead to
-     *  silent conflicts! */
+    /** Use only the label's 64-bit longHashCode as the hash key. Unlike
+     *  most hash maps, this does not check equals.
+     *  Note that while these hashes are very likely to be unique, the chance
+     *  of a collision is still greater than zero. If such an unlikely event
+     *  occurs, your document will get an incorrect facet.
+     */
    LRU_HASHED,

    /** Use the label as the hash key; this is always
@ -43,15 +47,15 @@ public class LruTaxonomyWriterCache implements TaxonomyWriterCache {

  private NameIntCacheLRU cache;

-  /** Creates this with {@link LRUType#LRU_HASHED} method. */
+  /** Creates this with {@link LRUType#LRU_STRING} method. */
  public LruTaxonomyWriterCache(int cacheSize) {
    // TODO (Facet): choose between NameHashIntCacheLRU and NameIntCacheLRU.
    // For guaranteed correctness - not relying on no-collisions in the hash
    // function, NameIntCacheLRU should be used:
    // On the other hand, NameHashIntCacheLRU takes less RAM but if there
-    // are collisions (which we never found) two different paths would be
-    // mapped to the same ordinal...
-    this(cacheSize, LRUType.LRU_HASHED);
+    // are collisions two different paths would be mapped to the same
+    // ordinal...
+    this(cacheSize, LRUType.LRU_STRING);
  }

  /** Creates this with the specified method. */
@ -60,8 +64,8 @@ public class LruTaxonomyWriterCache implements TaxonomyWriterCache {
    // For guaranteed correctness - not relying on no-collisions in the hash
    // function, NameIntCacheLRU should be used:
    // On the other hand, NameHashIntCacheLRU takes less RAM but if there
-    // are collisions (which we never found) two different paths would be
-    // mapped to the same ordinal...
+    // are collisions two different paths would be mapped to the same
+    // ordinal...
    if (lruType == LRUType.LRU_HASHED) {
      this.cache = new NameHashIntCacheLRU(cacheSize);
    } else {
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestLruTaxonomyWriterCache.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestLruTaxonomyWriterCache.java
@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.facet.taxonomy.writercache;
+
+import org.apache.lucene.facet.FacetTestCase;
+import org.apache.lucene.facet.taxonomy.FacetLabel;
+import org.junit.Test;
+
+public class TestLruTaxonomyWriterCache extends FacetTestCase {
+
+  @Test
+  public void testDefaultLRUTypeIsCollisionSafe() {
+    // These labels are clearly different, but have identical longHashCodes.
+    // Note that these labels are clearly contrived. We did encounter
+    // collisions in actual production data, but we aren't allowed to publish
+    // those.
+    final FacetLabel a = new FacetLabel("\0", "\u0003\uFFE2");
+    final FacetLabel b = new FacetLabel("\1", "\0");
+    // If this fails, then the longHashCode implementation has changed. This
+    // cannot prevent collisions. (All hashes must allow for collisions.) It
+    // will however stop the rest of this test from making sense. To fix, find
+    // new colliding labels, or make a subclass of FacetLabel that produces
+    // collisions.
+    assertEquals(a.longHashCode(), b.longHashCode());
+    // Make a cache with capacity > 2 so both our labels will fit. Don't
+    // specify an LRUType, since we want to check if the default is
+    // collision-safe.
+    final LruTaxonomyWriterCache cache = new LruTaxonomyWriterCache(10);
+    cache.put(a, 0);
+    cache.put(b, 1);
+    assertEquals(0, cache.get(a));
+    assertEquals(1, cache.get(b));
+  }
+
+}