From a0d995d0c4c99ccf00855df05e78290bc8225ed1 Mon Sep 17 00:00:00 2001 From: Gautam Worah Date: Thu, 1 Jul 2021 13:34:00 -0700 Subject: [PATCH] LUCENE-9964: Duplicate long values in a field should only be counted once when using SortedNumericDocValuesFields (#191) --- lucene/CHANGES.txt | 3 + .../lucene/facet/LongValueFacetCounts.java | 16 ++- .../facet/TestLongValueFacetCounts.java | 124 +++++++++++------- 3 files changed, 94 insertions(+), 49 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a635c3e2d0f..3db9d23adb6 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -380,6 +380,9 @@ Bug Fixes --------------------- * LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller) +* LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields + (Gautam Worah) + Other --------------------- (No changes) diff --git a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java index c6acbcb5167..44f2dabd694 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java @@ -162,8 +162,14 @@ public class LongValueFacetCounts extends Facets { if (limit > 0) { totCount++; } + long previousValue = 0; for (int i = 0; i < limit; i++) { - increment(multiValues.nextValue()); + long value = multiValues.nextValue(); + // do not increment the count for duplicate values + if (i == 0 || value != previousValue) { + increment(value); + } + previousValue = value; } } } @@ -208,8 +214,14 @@ public class LongValueFacetCounts extends Facets { if (limit > 0) { totCount++; } + long previousValue = 0; for (int i = 0; i < limit; i++) { - increment(multiValues.nextValue()); + long value = multiValues.nextValue(); + // do not increment the count for duplicate values + if (i == 0 || value != previousValue) { + increment(value); + } + previousValue = value; } } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java index 7fa6dc0a593..aab1c15e6b0 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java @@ -131,11 +131,11 @@ public class TestLongValueFacetCounts extends LuceneTestCase { d.close(); } - public void testRandom() throws Exception { + public void testRandomSingleValued() throws Exception { Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter(random(), dir); - int valueCount = atLeast(1000); + int docCount = atLeast(1000); double missingChance = random().nextDouble(); long maxValue; if (random().nextBoolean()) { @@ -146,7 +146,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { if (VERBOSE) { System.out.println( "TEST: valueCount=" - + valueCount + + docCount + " valueRange=-" + maxValue + "-" @@ -154,9 +154,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase { + " missingChance=" + missingChance); } - Long[] values = new Long[valueCount]; + Long[] values = new Long[docCount]; int missingCount = 0; - for (int i = 0; i < valueCount; i++) { + for (int i = 0; i < docCount; i++) { Document doc = new Document(); doc.add(new IntPoint("id", i)); if (random().nextDouble() > missingChance) { @@ -185,7 +185,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { // all docs Map expected = new HashMap<>(); int expectedChildCount = 0; - for (int i = 0; i < valueCount; i++) { + for (int i = 0; i < docCount; i++) { if (values[i] != null) { Integer curCount = expected.get(values[i]); if (curCount == null) { @@ -237,7 +237,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { "all docs, sort facets by value", expectedCounts, expectedChildCount, - valueCount - missingCount, + docCount - missingCount, actual, Integer.MAX_VALUE); @@ -253,9 +253,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase { }); int topN; if (random().nextBoolean()) { - topN = valueCount; + topN = docCount; } else { - topN = random().nextInt(valueCount); + topN = random().nextInt(docCount); } if (VERBOSE) { System.out.println(" topN=" + topN); @@ -265,13 +265,13 @@ public class TestLongValueFacetCounts extends LuceneTestCase { "all docs, sort facets by count", expectedCounts, expectedChildCount, - valueCount - missingCount, + docCount - missingCount, actual, topN); // subset of docs - int minId = random().nextInt(valueCount); - int maxId = random().nextInt(valueCount); + int minId = random().nextInt(docCount); + int maxId = random().nextInt(docCount); if (minId > maxId) { int tmp = minId; minId = maxId; @@ -334,9 +334,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase { return cmp; }); if (random().nextBoolean()) { - topN = valueCount; + topN = docCount; } else { - topN = random().nextInt(valueCount); + topN = random().nextInt(docCount); } actual = facetCounts.getTopChildrenSortByCount(topN); assertSame( @@ -355,7 +355,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { Directory dir = newDirectory(); RandomIndexWriter w = new RandomIndexWriter(random(), dir); - int valueCount = atLeast(1000); + int docCount = atLeast(1000); double missingChance = random().nextDouble(); // sometimes exercise codec optimizations when a claimed multi valued field is in fact single @@ -371,7 +371,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { if (VERBOSE) { System.out.println( "TEST: valueCount=" - + valueCount + + docCount + " valueRange=-" + maxValue + "-" @@ -382,8 +382,8 @@ public class TestLongValueFacetCounts extends LuceneTestCase { + allSingleValued); } - long[][] values = new long[valueCount][]; - for (int i = 0; i < valueCount; i++) { + long[][] values = new long[docCount][]; + for (int i = 0; i < docCount; i++) { Document doc = new Document(); doc.add(new IntPoint("id", i)); if (random().nextDouble() > missingChance) { @@ -403,6 +403,8 @@ public class TestLongValueFacetCounts extends LuceneTestCase { System.out.println(" doc=" + i + " values=" + Arrays.toString(values[i])); } + // sort values to enable duplicate detection by comparing with the previous value + Arrays.sort(values[i]); } else { if (VERBOSE) { System.out.println(" doc=" + i + " missing values"); @@ -426,23 +428,16 @@ public class TestLongValueFacetCounts extends LuceneTestCase { // all docs Map expected = new HashMap<>(); - int expectedChildCount = 0; int expectedTotalCount = 0; - for (int i = 0; i < valueCount; i++) { + for (int i = 0; i < docCount; i++) { if (values[i] != null && values[i].length > 0) { expectedTotalCount++; - for (long value : values[i]) { - Integer curCount = expected.get(value); - if (curCount == null) { - curCount = 0; - expectedChildCount++; - } - expected.put(value, curCount + 1); - } + setExpectedFrequencies(values[i], expected); } } List> expectedCounts = new ArrayList<>(expected.entrySet()); + int expectedChildCount = expected.size(); // sort by value expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey)); @@ -483,9 +478,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase { }); int topN; if (random().nextBoolean()) { - topN = valueCount; + topN = docCount; } else { - topN = random().nextInt(valueCount); + topN = random().nextInt(docCount); } if (VERBOSE) { System.out.println(" topN=" + topN); @@ -500,8 +495,8 @@ public class TestLongValueFacetCounts extends LuceneTestCase { topN); // subset of docs - int minId = random().nextInt(valueCount); - int maxId = random().nextInt(valueCount); + int minId = random().nextInt(docCount); + int maxId = random().nextInt(docCount); if (minId > maxId) { int tmp = minId; minId = maxId; @@ -517,22 +512,15 @@ public class TestLongValueFacetCounts extends LuceneTestCase { facetCounts = new LongValueFacetCounts("field", fc); expected = new HashMap<>(); - expectedChildCount = 0; - int totCount = 0; + expectedTotalCount = 0; for (int i = minId; i <= maxId; i++) { if (values[i] != null && values[i].length > 0) { - totCount++; - for (long value : values[i]) { - Integer curCount = expected.get(value); - if (curCount == null) { - expectedChildCount++; - curCount = 0; - } - expected.put(value, curCount + 1); - } + expectedTotalCount++; + setExpectedFrequencies(values[i], expected); } } expectedCounts = new ArrayList<>(expected.entrySet()); + expectedChildCount = expected.size(); // sort by value expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey)); @@ -541,7 +529,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { "id " + minId + "-" + maxId + ", sort facets by value", expectedCounts, expectedChildCount, - totCount, + expectedTotalCount, actual, Integer.MAX_VALUE); @@ -556,16 +544,16 @@ public class TestLongValueFacetCounts extends LuceneTestCase { return cmp; }); if (random().nextBoolean()) { - topN = valueCount; + topN = docCount; } else { - topN = random().nextInt(valueCount); + topN = random().nextInt(docCount); } actual = facetCounts.getTopChildrenSortByCount(topN); assertSame( "id " + minId + "-" + maxId + ", sort facets by count", expectedCounts, expectedChildCount, - totCount, + expectedTotalCount, actual, topN); } @@ -573,6 +561,17 @@ public class TestLongValueFacetCounts extends LuceneTestCase { dir.close(); } + private void setExpectedFrequencies(long[] values, Map expected) { + long previousValue = 0; + for (int j = 0; j < values.length; j++) { + if (j == 0 || previousValue != values[j]) { + Integer curCount = expected.getOrDefault(values[j], 0); + expected.put(values[j], curCount + 1); + } + previousValue = values[j]; + } + } + private static void assertSame( String desc, List> expectedCounts, @@ -619,4 +618,35 @@ public class TestLongValueFacetCounts extends LuceneTestCase { actual.labelValues[i].value.intValue()); } } + + /** + * LUCENE-9964: Duplicate long values in a document field should only be counted once when using + * SortedNumericDocValuesFields + */ + public void testDuplicateLongValues() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + + Document doc = new Document(); + // these two values are not unique in a document + doc.add(new SortedNumericDocValuesField("field", 42)); + doc.add(new SortedNumericDocValuesField("field", 42)); + w.addDocument(doc); + doc = new Document(); + doc.add(new SortedNumericDocValuesField("field", 43)); + doc.add(new SortedNumericDocValuesField("field", 43)); + w.addDocument(doc); + + IndexReader r = w.getReader(); + w.close(); + LongValueFacetCounts facetCounts = new LongValueFacetCounts("field", r); + + FacetResult fr = facetCounts.getAllChildrenSortByValue(); + for (LabelAndValue labelAndValue : fr.labelValues) { + assert labelAndValue.value.equals(1); + } + + r.close(); + dir.close(); + } }