mirror of https://github.com/apache/lucene.git
UCENE-9063: Speed up computation of impacts. (#1038)
The current design of CompetitiveImpactAccumulator treats norms in -128..127 as a special case that should be optimized. This commit goes a bit further by treating it as the normal case, and only ever adding impacts to the TreeSet if the norm is outside of the byte range. It avoids a number of operations on TreeSets like adding impacts or removing redundant impacts.
This commit is contained in:
parent
f76f4eaff8
commit
d1ef153d42
|
@ -19,8 +19,7 @@ package org.apache.lucene.codecs.lucene50;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.Collection;
|
||||
|
||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
|
||||
|
@ -140,7 +139,7 @@ final class Lucene50SkipWriter extends MultiLevelSkipListWriter {
|
|||
// sets of competitive freq,norm pairs should be empty at this point
|
||||
assert Arrays.stream(curCompetitiveFreqNorms)
|
||||
.map(CompetitiveImpactAccumulator::getCompetitiveFreqNormPairs)
|
||||
.mapToInt(Set::size)
|
||||
.mapToInt(Collection::size)
|
||||
.sum() == 0;
|
||||
initialized = true;
|
||||
}
|
||||
|
@ -204,7 +203,7 @@ final class Lucene50SkipWriter extends MultiLevelSkipListWriter {
|
|||
}
|
||||
|
||||
static void writeImpacts(CompetitiveImpactAccumulator acc, IndexOutput out) throws IOException {
|
||||
SortedSet<Impact> impacts = acc.getCompetitiveFreqNormPairs();
|
||||
Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs();
|
||||
Impact previous = new Impact(0, 0);
|
||||
for (Impact impact : impacts) {
|
||||
assert impact.freq > previous.freq;
|
||||
|
|
|
@ -16,11 +16,13 @@
|
|||
*/
|
||||
package org.apache.lucene.codecs;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.SortedSet;
|
||||
import java.util.List;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.index.Impact;
|
||||
|
@ -30,11 +32,14 @@ import org.apache.lucene.index.Impact;
|
|||
*/
|
||||
public final class CompetitiveImpactAccumulator {
|
||||
|
||||
// We speed up accumulation for common norm values by first computing
|
||||
// the max freq for all norms in -128..127
|
||||
// We speed up accumulation for common norm values with this array that maps
|
||||
// norm values in -128..127 to the maximum frequency observed for these norm
|
||||
// values
|
||||
private final int[] maxFreqs;
|
||||
private boolean dirty;
|
||||
private final TreeSet<Impact> freqNormPairs;
|
||||
// This TreeSet stores competitive (freq,norm) pairs for norm values that fall
|
||||
// outside of -128..127. It is always empty with the default similarity, which
|
||||
// encodes norms as bytes.
|
||||
private final TreeSet<Impact> otherFreqNormPairs;
|
||||
|
||||
/** Sole constructor. */
|
||||
public CompetitiveImpactAccumulator() {
|
||||
|
@ -51,14 +56,14 @@ public final class CompetitiveImpactAccumulator {
|
|||
return cmp;
|
||||
}
|
||||
};
|
||||
freqNormPairs = new TreeSet<>(comparator);
|
||||
otherFreqNormPairs = new TreeSet<>(comparator);
|
||||
}
|
||||
|
||||
/** Reset to the same state it was in after creation. */
|
||||
public void clear() {
|
||||
Arrays.fill(maxFreqs, 0);
|
||||
dirty = false;
|
||||
freqNormPairs.clear();
|
||||
otherFreqNormPairs.clear();
|
||||
assertConsistent();
|
||||
}
|
||||
|
||||
/** Accumulate a (freq,norm) pair, updating this structure if there is no
|
||||
|
@ -67,34 +72,52 @@ public final class CompetitiveImpactAccumulator {
|
|||
if (norm >= Byte.MIN_VALUE && norm <= Byte.MAX_VALUE) {
|
||||
int index = Byte.toUnsignedInt((byte) norm);
|
||||
maxFreqs[index] = Math.max(maxFreqs[index], freq);
|
||||
dirty = true;
|
||||
} else {
|
||||
add(new Impact(freq, norm));
|
||||
add(new Impact(freq, norm), otherFreqNormPairs);
|
||||
}
|
||||
assertConsistent();
|
||||
}
|
||||
|
||||
/** Merge {@code acc} into this. */
|
||||
public void addAll(CompetitiveImpactAccumulator acc) {
|
||||
for (Impact entry : acc.getCompetitiveFreqNormPairs()) {
|
||||
add(entry);
|
||||
int[] maxFreqs = this.maxFreqs;
|
||||
int[] otherMaxFreqs = acc.maxFreqs;
|
||||
for (int i = 0; i < maxFreqs.length; ++i) {
|
||||
maxFreqs[i] = Math.max(maxFreqs[i], otherMaxFreqs[i]);
|
||||
}
|
||||
|
||||
for (Impact entry : acc.otherFreqNormPairs) {
|
||||
add(entry, otherFreqNormPairs);
|
||||
}
|
||||
|
||||
assertConsistent();
|
||||
}
|
||||
|
||||
/** Get the set of competitive freq and norm pairs, orderer by increasing freq and norm. */
|
||||
public SortedSet<Impact> getCompetitiveFreqNormPairs() {
|
||||
if (dirty) {
|
||||
public Collection<Impact> getCompetitiveFreqNormPairs() {
|
||||
List<Impact> impacts = new ArrayList<>();
|
||||
int maxFreqForLowerNorms = 0;
|
||||
for (int i = 0; i < maxFreqs.length; ++i) {
|
||||
if (maxFreqs[i] > 0) {
|
||||
add(new Impact(maxFreqs[i], (byte) i));
|
||||
maxFreqs[i] = 0;
|
||||
int maxFreq = maxFreqs[i];
|
||||
if (maxFreq > maxFreqForLowerNorms) {
|
||||
impacts.add(new Impact(maxFreq, (byte) i));
|
||||
maxFreqForLowerNorms = maxFreq;
|
||||
}
|
||||
}
|
||||
dirty = false;
|
||||
}
|
||||
return Collections.unmodifiableSortedSet(freqNormPairs);
|
||||
}
|
||||
|
||||
private void add(Impact newEntry) {
|
||||
if (otherFreqNormPairs.isEmpty()) {
|
||||
// Common case: all norms are bytes
|
||||
return impacts;
|
||||
}
|
||||
|
||||
TreeSet<Impact> freqNormPairs = new TreeSet<>(this.otherFreqNormPairs);
|
||||
for (Impact impact : impacts) {
|
||||
add(impact, freqNormPairs);
|
||||
}
|
||||
return Collections.unmodifiableSet(freqNormPairs);
|
||||
}
|
||||
|
||||
private void add(Impact newEntry, TreeSet<Impact> freqNormPairs) {
|
||||
Impact next = freqNormPairs.ceiling(newEntry);
|
||||
if (next == null) {
|
||||
// nothing is more competitive
|
||||
|
@ -122,6 +145,23 @@ public final class CompetitiveImpactAccumulator {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getCompetitiveFreqNormPairs().toString();
|
||||
return new ArrayList<>(getCompetitiveFreqNormPairs()).toString();
|
||||
}
|
||||
|
||||
// Only called by assertions
|
||||
private boolean assertConsistent() {
|
||||
for (int freq : maxFreqs) {
|
||||
assert freq >= 0;
|
||||
}
|
||||
int previousFreq = 0;
|
||||
long previousNorm = 0;
|
||||
for (Impact impact : otherFreqNormPairs) {
|
||||
assert impact.norm < Byte.MIN_VALUE || impact.norm > Byte.MAX_VALUE;
|
||||
assert previousFreq < impact.freq;
|
||||
assert Long.compareUnsigned(previousNorm, impact.norm) < 0;
|
||||
previousFreq = impact.freq;
|
||||
previousNorm = impact.norm;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,8 +19,7 @@ package org.apache.lucene.codecs.lucene84;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.Collection;
|
||||
|
||||
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
|
||||
|
@ -141,7 +140,7 @@ final class Lucene84SkipWriter extends MultiLevelSkipListWriter {
|
|||
// sets of competitive freq,norm pairs should be empty at this point
|
||||
assert Arrays.stream(curCompetitiveFreqNorms)
|
||||
.map(CompetitiveImpactAccumulator::getCompetitiveFreqNormPairs)
|
||||
.mapToInt(Set::size)
|
||||
.mapToInt(Collection::size)
|
||||
.sum() == 0;
|
||||
initialized = true;
|
||||
}
|
||||
|
@ -205,7 +204,7 @@ final class Lucene84SkipWriter extends MultiLevelSkipListWriter {
|
|||
}
|
||||
|
||||
static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out) throws IOException {
|
||||
SortedSet<Impact> impacts = acc.getCompetitiveFreqNormPairs();
|
||||
Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs();
|
||||
Impact previous = new Impact(0, 0);
|
||||
for (Impact impact : impacts) {
|
||||
assert impact.freq > previous.freq;
|
||||
|
|
|
@ -16,9 +16,11 @@
|
|||
*/
|
||||
package org.apache.lucene.codecs;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Comparator;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.index.Impact;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
@ -27,59 +29,59 @@ public class TestCompetitiveFreqNormAccumulator extends LuceneTestCase {
|
|||
|
||||
public void testBasics() {
|
||||
CompetitiveImpactAccumulator acc = new CompetitiveImpactAccumulator();
|
||||
Set<Impact> expected = new HashSet<>();
|
||||
Set<Impact> expected = new TreeSet<>(Comparator.comparingInt(i -> i.freq));
|
||||
|
||||
acc.add(3, 5);
|
||||
expected.add(new Impact(3, 5));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
|
||||
acc.add(6, 11);
|
||||
expected.add(new Impact(6, 11));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
|
||||
acc.add(10, 13);
|
||||
expected.add(new Impact(10, 13));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
|
||||
acc.add(1, 2);
|
||||
expected.add(new Impact(1, 2));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
|
||||
acc.add(7, 9);
|
||||
expected.remove(new Impact(6, 11));
|
||||
expected.add(new Impact(7, 9));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
|
||||
acc.add(8, 2);
|
||||
expected.clear();
|
||||
expected.add(new Impact(10, 13));
|
||||
expected.add(new Impact(8, 2));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
}
|
||||
|
||||
public void testExtremeNorms() {
|
||||
CompetitiveImpactAccumulator acc = new CompetitiveImpactAccumulator();
|
||||
Set<Impact> expected = new HashSet<>();
|
||||
Set<Impact> expected = new TreeSet<>(Comparator.comparingInt(i -> i.freq));
|
||||
|
||||
acc.add(3, 5);
|
||||
expected.add(new Impact(3, 5));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
|
||||
acc.add(10, 10000);
|
||||
expected.add(new Impact(10, 10000));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
|
||||
acc.add(5, 200);
|
||||
expected.add(new Impact(5, 200));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
|
||||
acc.add(20, -100);
|
||||
expected.add(new Impact(20, -100));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
|
||||
acc.add(30, -3);
|
||||
expected.add(new Impact(30, -3));
|
||||
assertEquals(expected, acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(new ArrayList<>(expected), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
}
|
||||
|
||||
public void testOmitFreqs() {
|
||||
|
@ -89,7 +91,7 @@ public class TestCompetitiveFreqNormAccumulator extends LuceneTestCase {
|
|||
acc.add(1, 7);
|
||||
acc.add(1, 4);
|
||||
|
||||
assertEquals(Collections.singleton(new Impact(1, 4)), acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(Collections.singletonList(new Impact(1, 4)), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
}
|
||||
|
||||
public void testOmitNorms() {
|
||||
|
@ -99,6 +101,6 @@ public class TestCompetitiveFreqNormAccumulator extends LuceneTestCase {
|
|||
acc.add(7, 1);
|
||||
acc.add(4, 1);
|
||||
|
||||
assertEquals(Collections.singleton(new Impact(7, 1)), acc.getCompetitiveFreqNormPairs());
|
||||
assertEquals(Collections.singletonList(new Impact(7, 1)), new ArrayList<>(acc.getCompetitiveFreqNormPairs()));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue