mirror of https://github.com/apache/lucene.git
LUCENE-9964: Duplicate long values in a field should only be counted once when using SortedNumericDocValuesFields (#191)
This commit is contained in:
parent
834041f286
commit
a0d995d0c4
|
@ -380,6 +380,9 @@ Bug Fixes
|
|||
---------------------
|
||||
* LUCENE-9988: Fix DrillSideways correctness bug introduced in LUCENE-9944 (Greg Miller)
|
||||
|
||||
* LUCENE-9964: Duplicate long values in a document field should only be counted once when using SortedNumericDocValuesFields
|
||||
(Gautam Worah)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
(No changes)
|
||||
|
|
|
@ -162,8 +162,14 @@ public class LongValueFacetCounts extends Facets {
|
|||
if (limit > 0) {
|
||||
totCount++;
|
||||
}
|
||||
long previousValue = 0;
|
||||
for (int i = 0; i < limit; i++) {
|
||||
increment(multiValues.nextValue());
|
||||
long value = multiValues.nextValue();
|
||||
// do not increment the count for duplicate values
|
||||
if (i == 0 || value != previousValue) {
|
||||
increment(value);
|
||||
}
|
||||
previousValue = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -208,8 +214,14 @@ public class LongValueFacetCounts extends Facets {
|
|||
if (limit > 0) {
|
||||
totCount++;
|
||||
}
|
||||
long previousValue = 0;
|
||||
for (int i = 0; i < limit; i++) {
|
||||
increment(multiValues.nextValue());
|
||||
long value = multiValues.nextValue();
|
||||
// do not increment the count for duplicate values
|
||||
if (i == 0 || value != previousValue) {
|
||||
increment(value);
|
||||
}
|
||||
previousValue = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -131,11 +131,11 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
d.close();
|
||||
}
|
||||
|
||||
public void testRandom() throws Exception {
|
||||
public void testRandomSingleValued() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
|
||||
int valueCount = atLeast(1000);
|
||||
int docCount = atLeast(1000);
|
||||
double missingChance = random().nextDouble();
|
||||
long maxValue;
|
||||
if (random().nextBoolean()) {
|
||||
|
@ -146,7 +146,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
"TEST: valueCount="
|
||||
+ valueCount
|
||||
+ docCount
|
||||
+ " valueRange=-"
|
||||
+ maxValue
|
||||
+ "-"
|
||||
|
@ -154,9 +154,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
+ " missingChance="
|
||||
+ missingChance);
|
||||
}
|
||||
Long[] values = new Long[valueCount];
|
||||
Long[] values = new Long[docCount];
|
||||
int missingCount = 0;
|
||||
for (int i = 0; i < valueCount; i++) {
|
||||
for (int i = 0; i < docCount; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new IntPoint("id", i));
|
||||
if (random().nextDouble() > missingChance) {
|
||||
|
@ -185,7 +185,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
// all docs
|
||||
Map<Long, Integer> expected = new HashMap<>();
|
||||
int expectedChildCount = 0;
|
||||
for (int i = 0; i < valueCount; i++) {
|
||||
for (int i = 0; i < docCount; i++) {
|
||||
if (values[i] != null) {
|
||||
Integer curCount = expected.get(values[i]);
|
||||
if (curCount == null) {
|
||||
|
@ -237,7 +237,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
"all docs, sort facets by value",
|
||||
expectedCounts,
|
||||
expectedChildCount,
|
||||
valueCount - missingCount,
|
||||
docCount - missingCount,
|
||||
actual,
|
||||
Integer.MAX_VALUE);
|
||||
|
||||
|
@ -253,9 +253,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
});
|
||||
int topN;
|
||||
if (random().nextBoolean()) {
|
||||
topN = valueCount;
|
||||
topN = docCount;
|
||||
} else {
|
||||
topN = random().nextInt(valueCount);
|
||||
topN = random().nextInt(docCount);
|
||||
}
|
||||
if (VERBOSE) {
|
||||
System.out.println(" topN=" + topN);
|
||||
|
@ -265,13 +265,13 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
"all docs, sort facets by count",
|
||||
expectedCounts,
|
||||
expectedChildCount,
|
||||
valueCount - missingCount,
|
||||
docCount - missingCount,
|
||||
actual,
|
||||
topN);
|
||||
|
||||
// subset of docs
|
||||
int minId = random().nextInt(valueCount);
|
||||
int maxId = random().nextInt(valueCount);
|
||||
int minId = random().nextInt(docCount);
|
||||
int maxId = random().nextInt(docCount);
|
||||
if (minId > maxId) {
|
||||
int tmp = minId;
|
||||
minId = maxId;
|
||||
|
@ -334,9 +334,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
return cmp;
|
||||
});
|
||||
if (random().nextBoolean()) {
|
||||
topN = valueCount;
|
||||
topN = docCount;
|
||||
} else {
|
||||
topN = random().nextInt(valueCount);
|
||||
topN = random().nextInt(docCount);
|
||||
}
|
||||
actual = facetCounts.getTopChildrenSortByCount(topN);
|
||||
assertSame(
|
||||
|
@ -355,7 +355,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
|
||||
int valueCount = atLeast(1000);
|
||||
int docCount = atLeast(1000);
|
||||
double missingChance = random().nextDouble();
|
||||
|
||||
// sometimes exercise codec optimizations when a claimed multi valued field is in fact single
|
||||
|
@ -371,7 +371,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
"TEST: valueCount="
|
||||
+ valueCount
|
||||
+ docCount
|
||||
+ " valueRange=-"
|
||||
+ maxValue
|
||||
+ "-"
|
||||
|
@ -382,8 +382,8 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
+ allSingleValued);
|
||||
}
|
||||
|
||||
long[][] values = new long[valueCount][];
|
||||
for (int i = 0; i < valueCount; i++) {
|
||||
long[][] values = new long[docCount][];
|
||||
for (int i = 0; i < docCount; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new IntPoint("id", i));
|
||||
if (random().nextDouble() > missingChance) {
|
||||
|
@ -403,6 +403,8 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
System.out.println(" doc=" + i + " values=" + Arrays.toString(values[i]));
|
||||
}
|
||||
|
||||
// sort values to enable duplicate detection by comparing with the previous value
|
||||
Arrays.sort(values[i]);
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" doc=" + i + " missing values");
|
||||
|
@ -426,23 +428,16 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
|
||||
// all docs
|
||||
Map<Long, Integer> expected = new HashMap<>();
|
||||
int expectedChildCount = 0;
|
||||
int expectedTotalCount = 0;
|
||||
for (int i = 0; i < valueCount; i++) {
|
||||
for (int i = 0; i < docCount; i++) {
|
||||
if (values[i] != null && values[i].length > 0) {
|
||||
expectedTotalCount++;
|
||||
for (long value : values[i]) {
|
||||
Integer curCount = expected.get(value);
|
||||
if (curCount == null) {
|
||||
curCount = 0;
|
||||
expectedChildCount++;
|
||||
}
|
||||
expected.put(value, curCount + 1);
|
||||
}
|
||||
setExpectedFrequencies(values[i], expected);
|
||||
}
|
||||
}
|
||||
|
||||
List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet());
|
||||
int expectedChildCount = expected.size();
|
||||
|
||||
// sort by value
|
||||
expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
|
||||
|
@ -483,9 +478,9 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
});
|
||||
int topN;
|
||||
if (random().nextBoolean()) {
|
||||
topN = valueCount;
|
||||
topN = docCount;
|
||||
} else {
|
||||
topN = random().nextInt(valueCount);
|
||||
topN = random().nextInt(docCount);
|
||||
}
|
||||
if (VERBOSE) {
|
||||
System.out.println(" topN=" + topN);
|
||||
|
@ -500,8 +495,8 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
topN);
|
||||
|
||||
// subset of docs
|
||||
int minId = random().nextInt(valueCount);
|
||||
int maxId = random().nextInt(valueCount);
|
||||
int minId = random().nextInt(docCount);
|
||||
int maxId = random().nextInt(docCount);
|
||||
if (minId > maxId) {
|
||||
int tmp = minId;
|
||||
minId = maxId;
|
||||
|
@ -517,22 +512,15 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
facetCounts = new LongValueFacetCounts("field", fc);
|
||||
|
||||
expected = new HashMap<>();
|
||||
expectedChildCount = 0;
|
||||
int totCount = 0;
|
||||
expectedTotalCount = 0;
|
||||
for (int i = minId; i <= maxId; i++) {
|
||||
if (values[i] != null && values[i].length > 0) {
|
||||
totCount++;
|
||||
for (long value : values[i]) {
|
||||
Integer curCount = expected.get(value);
|
||||
if (curCount == null) {
|
||||
expectedChildCount++;
|
||||
curCount = 0;
|
||||
}
|
||||
expected.put(value, curCount + 1);
|
||||
}
|
||||
expectedTotalCount++;
|
||||
setExpectedFrequencies(values[i], expected);
|
||||
}
|
||||
}
|
||||
expectedCounts = new ArrayList<>(expected.entrySet());
|
||||
expectedChildCount = expected.size();
|
||||
|
||||
// sort by value
|
||||
expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
|
||||
|
@ -541,7 +529,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
"id " + minId + "-" + maxId + ", sort facets by value",
|
||||
expectedCounts,
|
||||
expectedChildCount,
|
||||
totCount,
|
||||
expectedTotalCount,
|
||||
actual,
|
||||
Integer.MAX_VALUE);
|
||||
|
||||
|
@ -556,16 +544,16 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
return cmp;
|
||||
});
|
||||
if (random().nextBoolean()) {
|
||||
topN = valueCount;
|
||||
topN = docCount;
|
||||
} else {
|
||||
topN = random().nextInt(valueCount);
|
||||
topN = random().nextInt(docCount);
|
||||
}
|
||||
actual = facetCounts.getTopChildrenSortByCount(topN);
|
||||
assertSame(
|
||||
"id " + minId + "-" + maxId + ", sort facets by count",
|
||||
expectedCounts,
|
||||
expectedChildCount,
|
||||
totCount,
|
||||
expectedTotalCount,
|
||||
actual,
|
||||
topN);
|
||||
}
|
||||
|
@ -573,6 +561,17 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
dir.close();
|
||||
}
|
||||
|
||||
private void setExpectedFrequencies(long[] values, Map<Long, Integer> expected) {
|
||||
long previousValue = 0;
|
||||
for (int j = 0; j < values.length; j++) {
|
||||
if (j == 0 || previousValue != values[j]) {
|
||||
Integer curCount = expected.getOrDefault(values[j], 0);
|
||||
expected.put(values[j], curCount + 1);
|
||||
}
|
||||
previousValue = values[j];
|
||||
}
|
||||
}
|
||||
|
||||
private static void assertSame(
|
||||
String desc,
|
||||
List<Map.Entry<Long, Integer>> expectedCounts,
|
||||
|
@ -619,4 +618,35 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
actual.labelValues[i].value.intValue());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* LUCENE-9964: Duplicate long values in a document field should only be counted once when using
|
||||
* SortedNumericDocValuesFields
|
||||
*/
|
||||
public void testDuplicateLongValues() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
|
||||
|
||||
Document doc = new Document();
|
||||
// these two values are not unique in a document
|
||||
doc.add(new SortedNumericDocValuesField("field", 42));
|
||||
doc.add(new SortedNumericDocValuesField("field", 42));
|
||||
w.addDocument(doc);
|
||||
doc = new Document();
|
||||
doc.add(new SortedNumericDocValuesField("field", 43));
|
||||
doc.add(new SortedNumericDocValuesField("field", 43));
|
||||
w.addDocument(doc);
|
||||
|
||||
IndexReader r = w.getReader();
|
||||
w.close();
|
||||
LongValueFacetCounts facetCounts = new LongValueFacetCounts("field", r);
|
||||
|
||||
FacetResult fr = facetCounts.getAllChildrenSortByValue();
|
||||
for (LabelAndValue labelAndValue : fr.labelValues) {
|
||||
assert labelAndValue.value.equals(1);
|
||||
}
|
||||
|
||||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue