LUCENE-10530: Avoid floating point precision bug in TestTaxonomyFacetAssociations (#848)

2025-03-06 16:29:30 +00:00 · 2022-04-29 08:57:46 -07:00 · 2022-04-29 08:57:46 -07:00 · 902a7df0e5
commit 902a7df0e5
parent 0dad9ddae8
2 changed files with 45 additions and 7 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -80,7 +80,7 @@ New Features
  to speed up computing the number of hits when possible. (Lu Xugang, Luca Cavanna, Adrien Grand)

 * LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory` 
-  implementation. `Monitor` can be created with a readonly `QueryIndex` in order to 
+  implementation. `Monitor` can be created with a readonly `QueryIndex` in order to
  have readonly `Monitor` instances. (Niko Usai)

 * LUCENE-10456: Implement rewrite and Weight#count for MultiRangeQuery
@ -153,8 +153,11 @@ Bug Fixes
  no documents instead of throwing an NPE. (Greg Miller)

 * LUCENE-10470: Check if polygon has been successfully tessellated before we fail (we are failing some valid
-  tessellations) and allow filtering edges that fold on top of the previous one. (Ignacio Vera)  
-  
+  tessellations) and allow filtering edges that fold on top of the previous one. (Ignacio Vera)
+
+* LUCENE-10530: Avoid floating point precision test case bug in TestTaxonomyFacetAssociations.
+  (Greg Miller)
+
 Build
 ---------------------

--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetAssociations.java
@ -31,11 +31,17 @@ import org.apache.lucene.facet.FacetsCollectorManager;
 import org.apache.lucene.facet.FacetsConfig;
 import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
 import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.index.RandomIndexWriter;
+import org.apache.lucene.util.BitUtil;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@ -115,7 +121,6 @@ public class TestTaxonomyFacetAssociations extends FacetTestCase {
          }
          if (random().nextBoolean()) { // maybe index a float association with the dim
            float nextFloat = random().nextFloat() * 10000f;
-            randomFloatValues.computeIfAbsent(path, k -> new ArrayList<>()).add(nextFloat);
            doc.add(new FloatAssociationFacetField(nextFloat, "float_random", path));
          }
        }
@ -129,7 +134,6 @@ public class TestTaxonomyFacetAssociations extends FacetTestCase {
        }
        if (random().nextBoolean()) {
          float nextFloat = random().nextFloat() * 10000f;
-          randomFloatSingleValued.computeIfAbsent(path, k -> new ArrayList<>()).add(nextFloat);
          doc.add(new FloatAssociationFacetField(nextFloat, "float_single_valued", path));
        }
      }
@ -141,6 +145,34 @@ public class TestTaxonomyFacetAssociations extends FacetTestCase {
    reader = writer.getReader();
    writer.close();
    taxoReader = new DirectoryTaxonomyReader(taxoDir);
+
+    // To avoid floating point precision issues, it's useful to keep track of the values in the
+    // exact same order they appear when iterating the doc values in the index. This ensures we
+    // sum them in the same order when determining expected values for tests cases and when the
+    // actual facets implementation sums them. See LUCENE-10530:
+    for (LeafReaderContext ctx : reader.leaves()) {
+      BinaryDocValues dv = DocValues.getBinary(ctx.reader(), "$facets.float");
+      for (int doc = dv.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = dv.nextDoc()) {
+        final BytesRef bytesRef = dv.binaryValue();
+        byte[] bytes = bytesRef.bytes;
+        int end = bytesRef.offset + bytesRef.length;
+        int offset = bytesRef.offset;
+        while (offset < end) {
+          int ord = (int) BitUtil.VH_BE_INT.get(bytes, offset);
+          offset += 4;
+          float value = (float) BitUtil.VH_BE_FLOAT.get(bytes, offset);
+          offset += 4;
+          FacetLabel label = taxoReader.getPath(ord);
+          String dim = label.components[0];
+          String child = label.components[1];
+          if ("float_random".equals(dim)) {
+            randomFloatValues.computeIfAbsent(child, k -> new ArrayList<>()).add(value);
+          } else if ("float_single_valued".equals(dim)) {
+            randomFloatSingleValued.computeIfAbsent(child, k -> new ArrayList<>()).add(value);
+          }
+        }
+      }
+    }
  }

  @AfterClass
@ -449,7 +481,10 @@ public class TestTaxonomyFacetAssociations extends FacetTestCase {
    float aggregatedValue = 0f;
    for (Map.Entry<String, Float> e : expected.entrySet()) {
      float value = e.getValue();
-      assertEquals(value, facets.getSpecificValue(dim, e.getKey()).floatValue(), 1);
+      // We can expect the floats to be exactly equal here since we're ensuring that we sum them
+      // in the same order when determining expected values and when computing facets. See
+      // LUCENE-10530:
+      assertEquals(value, facets.getSpecificValue(dim, e.getKey()).floatValue(), 0f);
      aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value);
    }

@ -465,7 +500,7 @@ public class TestTaxonomyFacetAssociations extends FacetTestCase {
      assertNull(facetResult);
    } else {
      assertEquals(dim, facetResult.dim);
-      assertEquals(aggregatedValue, facetResult.value.floatValue(), 1);
+      assertEquals(aggregatedValue, facetResult.value.floatValue(), 1f);
      assertEquals(expected.size(), facetResult.childCount);
    }
  }