LUCENE-9964: Duplicate long values in a field should only be counted once when using SortedNumericDocValuesFields (#191)

This commit is contained in:
Gautam Worah 2021-07-01 13:34:00 -07:00 committed by GitHub
parent 834041f286
commit a0d995d0c4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 94 additions and 49 deletions

View File

@ -380,6 +380,9 @@ Bug Fixes
---------------------
* LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller)
* LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields
(Gautam Worah)
Other
---------------------
(No changes)

View File

@ -162,8 +162,14 @@ public class LongValueFacetCounts extends Facets {
if (limit > 0) {
totCount++;
}
long previousValue = 0;
for (int i = 0; i < limit; i++) {
increment(multiValues.nextValue());
long value = multiValues.nextValue();
// do not increment the count for duplicate values
if (i == 0 || value != previousValue) {
increment(value);
}
previousValue = value;
}
}
}
@ -208,8 +214,14 @@ public class LongValueFacetCounts extends Facets {
if (limit > 0) {
totCount++;
}
long previousValue = 0;
for (int i = 0; i < limit; i++) {
increment(multiValues.nextValue());
long value = multiValues.nextValue();
// do not increment the count for duplicate values
if (i == 0 || value != previousValue) {
increment(value);
}
previousValue = value;
}
}
}

View File

@ -131,11 +131,11 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
d.close();
}
public void testRandom() throws Exception {
public void testRandomSingleValued() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
int valueCount = atLeast(1000);
int docCount = atLeast(1000);
double missingChance = random().nextDouble();
long maxValue;
if (random().nextBoolean()) {
@ -146,7 +146,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
if (VERBOSE) {
System.out.println(
"TEST: valueCount="
+ valueCount
+ docCount
+ " valueRange=-"
+ maxValue
+ "-"
@ -154,9 +154,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
+ " missingChance="
+ missingChance);
}
Long[] values = new Long[valueCount];
Long[] values = new Long[docCount];
int missingCount = 0;
for (int i = 0; i < valueCount; i++) {
for (int i = 0; i < docCount; i++) {
Document doc = new Document();
doc.add(new IntPoint("id", i));
if (random().nextDouble() > missingChance) {
@ -185,7 +185,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
// all docs
Map<Long, Integer> expected = new HashMap<>();
int expectedChildCount = 0;
for (int i = 0; i < valueCount; i++) {
for (int i = 0; i < docCount; i++) {
if (values[i] != null) {
Integer curCount = expected.get(values[i]);
if (curCount == null) {
@ -237,7 +237,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
"all docs, sort facets by value",
expectedCounts,
expectedChildCount,
valueCount - missingCount,
docCount - missingCount,
actual,
Integer.MAX_VALUE);
@ -253,9 +253,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
});
int topN;
if (random().nextBoolean()) {
topN = valueCount;
topN = docCount;
} else {
topN = random().nextInt(valueCount);
topN = random().nextInt(docCount);
}
if (VERBOSE) {
System.out.println(" topN=" + topN);
@ -265,13 +265,13 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
"all docs, sort facets by count",
expectedCounts,
expectedChildCount,
valueCount - missingCount,
docCount - missingCount,
actual,
topN);
// subset of docs
int minId = random().nextInt(valueCount);
int maxId = random().nextInt(valueCount);
int minId = random().nextInt(docCount);
int maxId = random().nextInt(docCount);
if (minId > maxId) {
int tmp = minId;
minId = maxId;
@ -334,9 +334,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
return cmp;
});
if (random().nextBoolean()) {
topN = valueCount;
topN = docCount;
} else {
topN = random().nextInt(valueCount);
topN = random().nextInt(docCount);
}
actual = facetCounts.getTopChildrenSortByCount(topN);
assertSame(
@ -355,7 +355,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
int valueCount = atLeast(1000);
int docCount = atLeast(1000);
double missingChance = random().nextDouble();
// sometimes exercise codec optimizations when a claimed multi valued field is in fact single
@ -371,7 +371,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
if (VERBOSE) {
System.out.println(
"TEST: valueCount="
+ valueCount
+ docCount
+ " valueRange=-"
+ maxValue
+ "-"
@ -382,8 +382,8 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
+ allSingleValued);
}
long[][] values = new long[valueCount][];
for (int i = 0; i < valueCount; i++) {
long[][] values = new long[docCount][];
for (int i = 0; i < docCount; i++) {
Document doc = new Document();
doc.add(new IntPoint("id", i));
if (random().nextDouble() > missingChance) {
@ -403,6 +403,8 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
System.out.println(" doc=" + i + " values=" + Arrays.toString(values[i]));
}
// sort values to enable duplicate detection by comparing with the previous value
Arrays.sort(values[i]);
} else {
if (VERBOSE) {
System.out.println(" doc=" + i + " missing values");
@ -426,23 +428,16 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
// all docs
Map<Long, Integer> expected = new HashMap<>();
int expectedChildCount = 0;
int expectedTotalCount = 0;
for (int i = 0; i < valueCount; i++) {
for (int i = 0; i < docCount; i++) {
if (values[i] != null && values[i].length > 0) {
expectedTotalCount++;
for (long value : values[i]) {
Integer curCount = expected.get(value);
if (curCount == null) {
curCount = 0;
expectedChildCount++;
}
expected.put(value, curCount + 1);
}
setExpectedFrequencies(values[i], expected);
}
}
List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet());
int expectedChildCount = expected.size();
// sort by value
expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
@ -483,9 +478,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
});
int topN;
if (random().nextBoolean()) {
topN = valueCount;
topN = docCount;
} else {
topN = random().nextInt(valueCount);
topN = random().nextInt(docCount);
}
if (VERBOSE) {
System.out.println(" topN=" + topN);
@ -500,8 +495,8 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
topN);
// subset of docs
int minId = random().nextInt(valueCount);
int maxId = random().nextInt(valueCount);
int minId = random().nextInt(docCount);
int maxId = random().nextInt(docCount);
if (minId > maxId) {
int tmp = minId;
minId = maxId;
@ -517,22 +512,15 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
facetCounts = new LongValueFacetCounts("field", fc);
expected = new HashMap<>();
expectedChildCount = 0;
int totCount = 0;
expectedTotalCount = 0;
for (int i = minId; i <= maxId; i++) {
if (values[i] != null && values[i].length > 0) {
totCount++;
for (long value : values[i]) {
Integer curCount = expected.get(value);
if (curCount == null) {
expectedChildCount++;
curCount = 0;
}
expected.put(value, curCount + 1);
}
expectedTotalCount++;
setExpectedFrequencies(values[i], expected);
}
}
expectedCounts = new ArrayList<>(expected.entrySet());
expectedChildCount = expected.size();
// sort by value
expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
@ -541,7 +529,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
"id " + minId + "-" + maxId + ", sort facets by value",
expectedCounts,
expectedChildCount,
totCount,
expectedTotalCount,
actual,
Integer.MAX_VALUE);
@ -556,16 +544,16 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
return cmp;
});
if (random().nextBoolean()) {
topN = valueCount;
topN = docCount;
} else {
topN = random().nextInt(valueCount);
topN = random().nextInt(docCount);
}
actual = facetCounts.getTopChildrenSortByCount(topN);
assertSame(
"id " + minId + "-" + maxId + ", sort facets by count",
expectedCounts,
expectedChildCount,
totCount,
expectedTotalCount,
actual,
topN);
}
@ -573,6 +561,17 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
dir.close();
}
/**
 * Accumulates the expected per-value document frequencies for one document's values.
 *
 * <p>Assumes {@code values} is sorted so that duplicates are adjacent; each distinct value in
 * the array contributes exactly one count to {@code expected} (LUCENE-9964 semantics: duplicate
 * values within a single document are counted once).
 *
 * @param values sorted doc values for a single document
 * @param expected running value-to-count map to update in place
 */
private void setExpectedFrequencies(long[] values, Map<Long, Integer> expected) {
  long prev = 0;
  boolean first = true;
  for (long value : values) {
    // only the first occurrence of a run of equal values is counted
    if (first || value != prev) {
      expected.merge(value, 1, Integer::sum);
    }
    prev = value;
    first = false;
  }
}
private static void assertSame(
String desc,
List<Map.Entry<Long, Integer>> expectedCounts,
@ -619,4 +618,35 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
actual.labelValues[i].value.intValue());
}
}
/**
* LUCENE-9964: Duplicate long values in a document field should only be counted once when using
* SortedNumericDocValuesFields
*/
/**
 * LUCENE-9964: Duplicate long values in a document field should only be counted once when using
 * SortedNumericDocValuesFields
 */
public void testDuplicateLongValues() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  // these two values are deliberately not unique within the document
  doc.add(new SortedNumericDocValuesField("field", 42));
  doc.add(new SortedNumericDocValuesField("field", 42));
  w.addDocument(doc);
  doc = new Document();
  doc.add(new SortedNumericDocValuesField("field", 43));
  doc.add(new SortedNumericDocValuesField("field", 43));
  w.addDocument(doc);
  IndexReader r = w.getReader();
  w.close();
  LongValueFacetCounts facetCounts = new LongValueFacetCounts("field", r);
  FacetResult fr = facetCounts.getAllChildrenSortByValue();
  // two docs with one distinct value each -> exactly two facet labels
  assertEquals(2, fr.labelValues.length);
  for (LabelAndValue labelAndValue : fr.labelValues) {
    // use JUnit assertEquals, not the bare `assert` keyword: `assert` is a
    // silent no-op when the JVM runs without -ea, letting the test pass vacuously
    assertEquals(1, labelAndValue.value.intValue());
  }
  r.close();
  dir.close();
}
}