From ded8efa82aa30f850f552872daea6036df8d1220 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 26 Nov 2019 11:49:57 +0100 Subject: [PATCH] LUCENE-9063: Speed up computation of impacts. (#1038) The current design of CompetitiveImpactAccumulator treats norms in -128..127 as a special case that should be optimized. This commit goes a bit further by treating it as the normal case, and only ever adding impacts to the TreeSet if the norm is outside of the byte range. It avoids a number of operations on TreeSets like adding impacts or removing redundant impacts. --- .../codecs/lucene50/Lucene50SkipWriter.java | 7 +- .../codecs/CompetitiveImpactAccumulator.java | 86 ++++++++++++++----- .../codecs/lucene84/Lucene84SkipWriter.java | 7 +- .../TestCompetitiveFreqNormAccumulator.java | 34 ++++---- 4 files changed, 87 insertions(+), 47 deletions(-) diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50SkipWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50SkipWriter.java index 9b9b4de3124..ef641beda22 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50SkipWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50SkipWriter.java @@ -19,8 +19,7 @@ package org.apache.lucene.codecs.lucene50; import java.io.IOException; import java.util.Arrays; -import java.util.Set; -import java.util.SortedSet; +import java.util.Collection; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; import org.apache.lucene.codecs.MultiLevelSkipListWriter; @@ -141,7 +140,7 @@ final class Lucene50SkipWriter extends MultiLevelSkipListWriter { // sets of competitive freq,norm pairs should be empty at this point assert Arrays.stream(curCompetitiveFreqNorms) .map(CompetitiveImpactAccumulator::getCompetitiveFreqNormPairs) - .mapToInt(Set::size) + .mapToInt(Collection::size) .sum() == 0; initialized = true; } @@ -205,7 +204,7 @@ final class Lucene50SkipWriter extends MultiLevelSkipListWriter { } static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out) throws IOException { - SortedSet impacts = acc.getCompetitiveFreqNormPairs(); + Collection impacts = acc.getCompetitiveFreqNormPairs(); Impact previous = new Impact(0, 0); for (Impact impact : impacts) { assert impact.freq > previous.freq; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/CompetitiveImpactAccumulator.java b/lucene/core/src/java/org/apache/lucene/codecs/CompetitiveImpactAccumulator.java index 34f7d79603f..d887c4f3018 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/CompetitiveImpactAccumulator.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/CompetitiveImpactAccumulator.java @@ -16,11 +16,13 @@ */ package org.apache.lucene.codecs; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.Iterator; -import java.util.SortedSet; +import java.util.List; import java.util.TreeSet; import org.apache.lucene.index.Impact; @@ -30,11 +32,14 @@ import org.apache.lucene.index.Impact; */ public final class CompetitiveImpactAccumulator { - // We speed up accumulation for common norm values by first computing - // the max freq for all norms in -128..127 + // We speed up accumulation for common norm values with this array that maps + // norm values in -128..127 to the maximum frequency observed for these norm + // values private final int[] maxFreqs; - private boolean dirty; - private final TreeSet freqNormPairs; + // This TreeSet stores competitive (freq,norm) pairs for norm values that fall + // outside of -128..127. It is always empty with the default similarity, which + // encodes norms as bytes. + private final TreeSet otherFreqNormPairs; /** Sole constructor. */ public CompetitiveImpactAccumulator() { @@ -51,14 +56,14 @@ public final class CompetitiveImpactAccumulator { return cmp; } }; - freqNormPairs = new TreeSet<>(comparator); + otherFreqNormPairs = new TreeSet<>(comparator); } /** Reset to the same state it was in after creation. */ public void clear() { Arrays.fill(maxFreqs, 0); - dirty = false; - freqNormPairs.clear(); + otherFreqNormPairs.clear(); + assertConsistent(); } /** Accumulate a (freq,norm) pair, updating this structure if there is no @@ -67,34 +72,52 @@ public final class CompetitiveImpactAccumulator { if (norm >= Byte.MIN_VALUE && norm <= Byte.MAX_VALUE) { int index = Byte.toUnsignedInt((byte) norm); maxFreqs[index] = Math.max(maxFreqs[index], freq); - dirty = true; } else { - add(new Impact(freq, norm)); + add(new Impact(freq, norm), otherFreqNormPairs); } + assertConsistent(); } /** Merge {@code acc} into this. */ public void addAll(CompetitiveImpactAccumulator acc) { - for (Impact entry : acc.getCompetitiveFreqNormPairs()) { - add(entry); + int[] maxFreqs = this.maxFreqs; + int[] otherMaxFreqs = acc.maxFreqs; + for (int i = 0; i < maxFreqs.length; ++i) { + maxFreqs[i] = Math.max(maxFreqs[i], otherMaxFreqs[i]); } + + for (Impact entry : acc.otherFreqNormPairs) { + add(entry, otherFreqNormPairs); + } + + assertConsistent(); } /** Get the set of competitive freq and norm pairs, orderer by increasing freq and norm. */ - public SortedSet getCompetitiveFreqNormPairs() { - if (dirty) { - for (int i = 0; i < maxFreqs.length; ++i) { - if (maxFreqs[i] > 0) { - add(new Impact(maxFreqs[i], (byte) i)); - maxFreqs[i] = 0; - } + public Collection getCompetitiveFreqNormPairs() { + List impacts = new ArrayList<>(); + int maxFreqForLowerNorms = 0; + for (int i = 0; i < maxFreqs.length; ++i) { + int maxFreq = maxFreqs[i]; + if (maxFreq > maxFreqForLowerNorms) { + impacts.add(new Impact(maxFreq, (byte) i)); + maxFreqForLowerNorms = maxFreq; } - dirty = false; } - return Collections.unmodifiableSortedSet(freqNormPairs); + + if (otherFreqNormPairs.isEmpty()) { + // Common case: all norms are bytes + return impacts; + } + + TreeSet freqNormPairs = new TreeSet<>(this.otherFreqNormPairs); + for (Impact impact : impacts) { + add(impact, freqNormPairs); + } + return Collections.unmodifiableSet(freqNormPairs); } - private void add(Impact newEntry) { + private void add(Impact newEntry, TreeSet freqNormPairs) { Impact next = freqNormPairs.ceiling(newEntry); if (next == null) { // nothing is more competitive @@ -122,6 +145,23 @@ public final class CompetitiveImpactAccumulator { @Override public String toString() { - return getCompetitiveFreqNormPairs().toString(); + return new ArrayList<>(getCompetitiveFreqNormPairs()).toString(); + } + + // Only called by assertions + private boolean assertConsistent() { + for (int freq : maxFreqs) { + assert freq >= 0; + } + int previousFreq = 0; + long previousNorm = 0; + for (Impact impact : otherFreqNormPairs) { + assert impact.norm < Byte.MIN_VALUE || impact.norm > Byte.MAX_VALUE; + assert previousFreq < impact.freq; + assert Long.compareUnsigned(previousNorm, impact.norm) < 0; + previousFreq = impact.freq; + previousNorm = impact.norm; + } + return true; } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84SkipWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84SkipWriter.java index 75131507844..6e8fd89bd80 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84SkipWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84SkipWriter.java @@ -19,8 +19,7 @@ package org.apache.lucene.codecs.lucene84; import java.io.IOException; import java.util.Arrays; -import java.util.Set; -import java.util.SortedSet; +import java.util.Collection; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; import org.apache.lucene.codecs.MultiLevelSkipListWriter; @@ -141,7 +140,7 @@ final class Lucene84SkipWriter extends MultiLevelSkipListWriter { // sets of competitive freq,norm pairs should be empty at this point assert Arrays.stream(curCompetitiveFreqNorms) .map(CompetitiveImpactAccumulator::getCompetitiveFreqNormPairs) - .mapToInt(Set::size) + .mapToInt(Collection::size) .sum() == 0; initialized = true; } @@ -205,7 +204,7 @@ final class Lucene84SkipWriter extends MultiLevelSkipListWriter { } static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out) throws IOException { - SortedSet impacts = acc.getCompetitiveFreqNormPairs(); + Collection impacts = acc.getCompetitiveFreqNormPairs(); Impact previous = new Impact(0, 0); for (Impact impact : impacts) { assert impact.freq > previous.freq; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/TestCompetitiveFreqNormAccumulator.java b/lucene/core/src/test/org/apache/lucene/codecs/TestCompetitiveFreqNormAccumulator.java index f4d3e69319b..4fca1cccd15 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/TestCompetitiveFreqNormAccumulator.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/TestCompetitiveFreqNormAccumulator.java @@ -17,8 +17,10 @@ package org.apache.lucene.codecs; import java.util.Collections; -import java.util.HashSet; +import java.util.Comparator; +import java.util.List; import java.util.Set; +import java.util.TreeSet; import org.apache.lucene.index.Impact; import org.apache.lucene.util.LuceneTestCase; @@ -27,59 +29,59 @@ public class TestCompetitiveFreqNormAccumulator extends LuceneTestCase { public void testBasics() { CompetitiveImpactAccumulator acc = new CompetitiveImpactAccumulator(); - Set expected = new HashSet<>(); + Set expected = new TreeSet<>(Comparator.comparingInt(i -> i.freq)); acc.add(3, 5); expected.add(new Impact(3, 5)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); acc.add(6, 11); expected.add(new Impact(6, 11)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); acc.add(10, 13); expected.add(new Impact(10, 13)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); acc.add(1, 2); expected.add(new Impact(1, 2)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); acc.add(7, 9); expected.remove(new Impact(6, 11)); expected.add(new Impact(7, 9)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); acc.add(8, 2); expected.clear(); expected.add(new Impact(10, 13)); expected.add(new Impact(8, 2)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); } public void testExtremeNorms() { CompetitiveImpactAccumulator acc = new CompetitiveImpactAccumulator(); - Set expected = new HashSet<>(); + Set expected = new TreeSet<>(Comparator.comparingInt(i -> i.freq)); acc.add(3, 5); expected.add(new Impact(3, 5)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); acc.add(10, 10000); expected.add(new Impact(10, 10000)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); acc.add(5, 200); expected.add(new Impact(5, 200)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); acc.add(20, -100); expected.add(new Impact(20, -100)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); acc.add(30, -3); expected.add(new Impact(30, -3)); - assertEquals(expected, acc.getCompetitiveFreqNormPairs()); + assertEquals(List.copyOf(expected), List.copyOf(acc.getCompetitiveFreqNormPairs())); } public void testOmitFreqs() { @@ -89,7 +91,7 @@ public class TestCompetitiveFreqNormAccumulator extends LuceneTestCase { acc.add(1, 7); acc.add(1, 4); - assertEquals(Collections.singleton(new Impact(1, 4)), acc.getCompetitiveFreqNormPairs()); + assertEquals(Collections.singletonList(new Impact(1, 4)), List.copyOf(acc.getCompetitiveFreqNormPairs())); } public void testOmitNorms() { @@ -99,6 +101,6 @@ public class TestCompetitiveFreqNormAccumulator extends LuceneTestCase { acc.add(7, 1); acc.add(4, 1); - assertEquals(Collections.singleton(new Impact(7, 1)), acc.getCompetitiveFreqNormPairs()); + assertEquals(Collections.singletonList(new Impact(7, 1)), List.copyOf(acc.getCompetitiveFreqNormPairs())); } }