From abd282d258d23d19b7f7c1e96332a19fa7b7b827 Mon Sep 17 00:00:00 2001 From: Mike Date: Thu, 6 Feb 2020 12:16:45 -0800 Subject: [PATCH] LUCENE-9142 Refactor IntSet operations for determinize (#1184) * LUCENE-9142 Refactor SortedIntSet for equality Split SortedIntSet into a class heirarchy to make comparisons to FrozenIntSet more meaningful. Use Arrays.equals for more efficient comparison. Add tests for IntSet to verify correctness. --- .../lucene/util/automaton/FrozenIntSet.java | 51 +++++++ .../apache/lucene/util/automaton/IntSet.java | 46 ++++++ .../lucene/util/automaton/Operations.java | 11 +- .../lucene/util/automaton/SortedIntSet.java | 139 ++++-------------- .../lucene/util/automaton/TestIntSet.java | 94 ++++++++++++ 5 files changed, 222 insertions(+), 119 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/automaton/FrozenIntSet.java create mode 100644 lucene/core/src/java/org/apache/lucene/util/automaton/IntSet.java create mode 100644 lucene/core/src/test/org/apache/lucene/util/automaton/TestIntSet.java diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/FrozenIntSet.java b/lucene/core/src/java/org/apache/lucene/util/automaton/FrozenIntSet.java new file mode 100644 index 00000000000..02da6647a75 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/FrozenIntSet.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.automaton; + +import java.util.Arrays; + +final class FrozenIntSet extends IntSet { + final int[] values; + final int hashCode; + final int state; + + FrozenIntSet(int[] values, int hashCode, int state) { + this.values = values; + this.hashCode = hashCode; + this.state = state; + } + + @Override + int[] getArray() { + return values; + } + + @Override + int size() { + return values.length; + } + + @Override + public int hashCode() { + return hashCode; + } + + @Override + public String toString() { + return Arrays.toString(values); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/IntSet.java b/lucene/core/src/java/org/apache/lucene/util/automaton/IntSet.java new file mode 100644 index 00000000000..667a0c5432a --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/IntSet.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.automaton; + +import java.util.Arrays; + +abstract class IntSet { + /** + * Return an array representation of this int set's values. Values are valid for indices [0, {@link #size()}). + * If this is a mutable int set, then changes to the set are not guaranteed to be visible in this array. + * @return an array containing the values for this set, guaranteed to be at least {@link #size()} elements + */ + abstract int[] getArray(); + + /** + * Guaranteed to be less than or equal to the length of the array returned by {@link #getArray()}. + * @return The number of values in this set. + */ + abstract int size(); + + @Override + public abstract int hashCode(); + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof IntSet)) return false; + IntSet that = (IntSet) o; + return hashCode() == that.hashCode() + && Arrays.equals(getArray(), 0, size(), that.getArray(), 0, that.size()); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 8ed1b12be2e..844c484d441 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -682,13 +682,14 @@ final public class Operations { //System.out.println("DET:"); //a.writeDot("/l/la/lucene/core/detin.dot"); - SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(0, 0); + // Same initial values and state will always have the same hashCode + FrozenIntSet initialset = new FrozenIntSet(new int[] { 0 }, 683, 0); // Create state 0: b.createState(); - ArrayDeque worklist = new ArrayDeque<>(); - Map newstate = new HashMap<>(); + ArrayDeque worklist = new ArrayDeque<>(); + Map newstate = new HashMap<>(); worklist.add(initialset); @@ -704,7 +705,7 @@ final public class Operations { Transition t = new Transition(); while (worklist.size() > 0) { - SortedIntSet.FrozenIntSet s = worklist.removeFirst(); + FrozenIntSet s = worklist.removeFirst(); //System.out.println("det: pop set=" + s); // Collate all outgoing transitions by min/1+max: @@ -745,7 +746,7 @@ final public class Operations { if (q >= maxDeterminizedStates) { throw new TooComplexToDeterminizeException(a, maxDeterminizedStates); } - final SortedIntSet.FrozenIntSet p = statesSet.freeze(q); + final FrozenIntSet p = statesSet.freeze(q); //System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount); worklist.add(p); b.setAccept(q, accCount > 0); diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/SortedIntSet.java b/lucene/core/src/java/org/apache/lucene/util/automaton/SortedIntSet.java index 3251aad9f09..2c98c658e64 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/SortedIntSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/SortedIntSet.java @@ -21,9 +21,8 @@ import java.util.Map; import org.apache.lucene.util.ArrayUtil; // Just holds a set of int[] states, plus a corresponding -// int[] count per state. Used by -// BasicOperations.determinize -final class SortedIntSet { +// int[] count per state. Used by Operations.determinize +final class SortedIntSet extends IntSet { int[] values; int[] counts; int upto; @@ -37,9 +36,7 @@ final class SortedIntSet { private boolean useTreeMap; - int state; - - public SortedIntSet(int capacity) { + SortedIntSet(int capacity) { values = new int[capacity]; counts = new int[capacity]; } @@ -47,13 +44,7 @@ final class SortedIntSet { // Adds this state to the set public void incr(int num) { if (useTreeMap) { - final Integer key = num; - Integer val = map.get(key); - if (val == null) { - map.put(key, 1); - } else { - map.put(key, 1+val); - } + map.merge(num, 1, Integer::sum); return; } @@ -130,7 +121,7 @@ final class SortedIntSet { assert false; } - public void computeHash() { + void computeHash() { if (useTreeMap) { if (map.size() > values.length) { final int size = ArrayUtil.oversize(map.size(), Integer.BYTES); @@ -151,41 +142,34 @@ final class SortedIntSet { } } - public FrozenIntSet freeze(int state) { - final int[] c = new int[upto]; - System.arraycopy(values, 0, c, 0, upto); + /** + * Create a snapshot of this int set associated with a given state. The snapshot will not retain any frequency + * information about the elements of this set, only existence. + *

+ * It is the caller's responsibility to ensure that the hashCode and data are up to date via the {@link #computeHash()} method before calling this method. + * @param state the state to associate with the frozen set. + * @return A new FrozenIntSet with the same values as this set. + */ + FrozenIntSet freeze(int state) { + final int[] c = ArrayUtil.copyOfSubArray(values, 0, upto); return new FrozenIntSet(c, hashCode, state); } + @Override + int[] getArray() { + return values; + } + + @Override + int size() { + return upto; + } + @Override public int hashCode() { return hashCode; } - @Override - public boolean equals(Object _other) { - if (_other == null) { - return false; - } - if (!(_other instanceof FrozenIntSet)) { - return false; - } - FrozenIntSet other = (FrozenIntSet) _other; - if (hashCode != other.hashCode) { - return false; - } - if (other.values.length != upto) { - return false; - } - for(int i=0;i 0) { - sb.append(' '); - } - sb.append(values[i]); - } - sb.append(']'); - return sb.toString(); - } - } } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestIntSet.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestIntSet.java new file mode 100644 index 00000000000..47bc9f53979 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestIntSet.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util.automaton; + +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; + +public class TestIntSet extends LuceneTestCase { + @Test + public void testFreezeEqualitySmallSet() { + testFreezeEquality(10); + } + + @Test + public void testFreezeEqualityLargeSet() { + testFreezeEquality(100); + } + + private void testFreezeEquality(int size) { + SortedIntSet sortedSet = new SortedIntSet(0); + + for (int i = 0; i < size; i++) { + // Some duplicates is nice but not critical + sortedSet.incr(random().nextInt(i + 1)); + } + + sortedSet.computeHash(); + IntSet frozen0 = sortedSet.freeze(0); + + assertEquals("Frozen set not equal to origin sorted set.", sortedSet, frozen0); + assertEquals("Symmetry: Sorted set not equal to frozen set.", frozen0, sortedSet); + + IntSet frozen1 = sortedSet.freeze(random().nextInt()); + assertEquals("Sorted set modified while freezing?", sortedSet, frozen1); + assertEquals("Frozen sets were not equal", frozen0, frozen1); + } + + @Test + public void testMapCutover() { + SortedIntSet set = new SortedIntSet(10); + for (int i = 0; i < 35; i++) { + // No duplicates so there are enough elements to trigger impl cutover + set.incr(i); + } + + set.computeHash(); + assertTrue(set.size() > 32); + + for (int i = 0; i < 35; i++) { + // This is pretty much the worst case, perf wise + set.decr(i); + } + + set.computeHash(); + assertTrue(set.size() == 0); + } + + @Test + public void testModify() { + SortedIntSet set = new SortedIntSet(2); + set.incr(1); + set.incr(2); + set.computeHash(); + + FrozenIntSet set2 = set.freeze(0); + assertEquals(set, set2); + + set.incr(1); + set.computeHash(); + assertEquals(set, set2); + + set.decr(1); + set.computeHash(); + assertEquals(set, set2); + + set.decr(1); + set.computeHash(); + assertNotEquals(set, set2); + } +} \ No newline at end of file