LUCENE-9142 Refactor IntSet operations for determinize (#1184)

* LUCENE-9142 Refactor SortedIntSet for equality

Split SortedIntSet into a class heirarchy to make comparisons to
FrozenIntSet more meaningful. Use Arrays.equals for more efficient
comparison. Add tests for IntSet to verify correctness.
This commit is contained in:
Mike 2020-02-06 12:16:45 -08:00 committed by GitHub
parent f3cd1dbde3
commit abd282d258
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 222 additions and 119 deletions

View File

@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.automaton;
import java.util.Arrays;
final class FrozenIntSet extends IntSet {
final int[] values;
final int hashCode;
final int state;
FrozenIntSet(int[] values, int hashCode, int state) {
this.values = values;
this.hashCode = hashCode;
this.state = state;
}
@Override
int[] getArray() {
return values;
}
@Override
int size() {
return values.length;
}
@Override
public int hashCode() {
return hashCode;
}
@Override
public String toString() {
return Arrays.toString(values);
}
}

View File

@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.automaton;
import java.util.Arrays;
abstract class IntSet {
/**
* Return an array representation of this int set's values. Values are valid for indices [0, {@link #size()}).
* If this is a mutable int set, then changes to the set are not guaranteed to be visible in this array.
* @return an array containing the values for this set, guaranteed to be at least {@link #size()} elements
*/
abstract int[] getArray();
/**
* Guaranteed to be less than or equal to the length of the array returned by {@link #getArray()}.
* @return The number of values in this set.
*/
abstract int size();
@Override
public abstract int hashCode();
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof IntSet)) return false;
IntSet that = (IntSet) o;
return hashCode() == that.hashCode()
&& Arrays.equals(getArray(), 0, size(), that.getArray(), 0, that.size());
}
}

View File

@ -682,13 +682,14 @@ final public class Operations {
//System.out.println("DET:"); //System.out.println("DET:");
//a.writeDot("/l/la/lucene/core/detin.dot"); //a.writeDot("/l/la/lucene/core/detin.dot");
SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(0, 0); // Same initial values and state will always have the same hashCode
FrozenIntSet initialset = new FrozenIntSet(new int[] { 0 }, 683, 0);
// Create state 0: // Create state 0:
b.createState(); b.createState();
ArrayDeque<SortedIntSet.FrozenIntSet> worklist = new ArrayDeque<>(); ArrayDeque<FrozenIntSet> worklist = new ArrayDeque<>();
Map<SortedIntSet.FrozenIntSet,Integer> newstate = new HashMap<>(); Map<IntSet,Integer> newstate = new HashMap<>();
worklist.add(initialset); worklist.add(initialset);
@ -704,7 +705,7 @@ final public class Operations {
Transition t = new Transition(); Transition t = new Transition();
while (worklist.size() > 0) { while (worklist.size() > 0) {
SortedIntSet.FrozenIntSet s = worklist.removeFirst(); FrozenIntSet s = worklist.removeFirst();
//System.out.println("det: pop set=" + s); //System.out.println("det: pop set=" + s);
// Collate all outgoing transitions by min/1+max: // Collate all outgoing transitions by min/1+max:
@ -745,7 +746,7 @@ final public class Operations {
if (q >= maxDeterminizedStates) { if (q >= maxDeterminizedStates) {
throw new TooComplexToDeterminizeException(a, maxDeterminizedStates); throw new TooComplexToDeterminizeException(a, maxDeterminizedStates);
} }
final SortedIntSet.FrozenIntSet p = statesSet.freeze(q); final FrozenIntSet p = statesSet.freeze(q);
//System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount); //System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount);
worklist.add(p); worklist.add(p);
b.setAccept(q, accCount > 0); b.setAccept(q, accCount > 0);

View File

@ -21,9 +21,8 @@ import java.util.Map;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
// Just holds a set of int[] states, plus a corresponding // Just holds a set of int[] states, plus a corresponding
// int[] count per state. Used by // int[] count per state. Used by Operations.determinize
// BasicOperations.determinize final class SortedIntSet extends IntSet {
final class SortedIntSet {
int[] values; int[] values;
int[] counts; int[] counts;
int upto; int upto;
@ -37,9 +36,7 @@ final class SortedIntSet {
private boolean useTreeMap; private boolean useTreeMap;
int state; SortedIntSet(int capacity) {
public SortedIntSet(int capacity) {
values = new int[capacity]; values = new int[capacity];
counts = new int[capacity]; counts = new int[capacity];
} }
@ -47,13 +44,7 @@ final class SortedIntSet {
// Adds this state to the set // Adds this state to the set
public void incr(int num) { public void incr(int num) {
if (useTreeMap) { if (useTreeMap) {
final Integer key = num; map.merge(num, 1, Integer::sum);
Integer val = map.get(key);
if (val == null) {
map.put(key, 1);
} else {
map.put(key, 1+val);
}
return; return;
} }
@ -130,7 +121,7 @@ final class SortedIntSet {
assert false; assert false;
} }
public void computeHash() { void computeHash() {
if (useTreeMap) { if (useTreeMap) {
if (map.size() > values.length) { if (map.size() > values.length) {
final int size = ArrayUtil.oversize(map.size(), Integer.BYTES); final int size = ArrayUtil.oversize(map.size(), Integer.BYTES);
@ -151,41 +142,34 @@ final class SortedIntSet {
} }
} }
public FrozenIntSet freeze(int state) { /**
final int[] c = new int[upto]; * Create a snapshot of this int set associated with a given state. The snapshot will not retain any frequency
System.arraycopy(values, 0, c, 0, upto); * information about the elements of this set, only existence.
* <p>
* It is the caller's responsibility to ensure that the hashCode and data are up to date via the {@link #computeHash()} method before calling this method.
* @param state the state to associate with the frozen set.
* @return A new FrozenIntSet with the same values as this set.
*/
FrozenIntSet freeze(int state) {
final int[] c = ArrayUtil.copyOfSubArray(values, 0, upto);
return new FrozenIntSet(c, hashCode, state); return new FrozenIntSet(c, hashCode, state);
} }
@Override
int[] getArray() {
return values;
}
@Override
int size() {
return upto;
}
@Override @Override
public int hashCode() { public int hashCode() {
return hashCode; return hashCode;
} }
@Override
public boolean equals(Object _other) {
if (_other == null) {
return false;
}
if (!(_other instanceof FrozenIntSet)) {
return false;
}
FrozenIntSet other = (FrozenIntSet) _other;
if (hashCode != other.hashCode) {
return false;
}
if (other.values.length != upto) {
return false;
}
for(int i=0;i<upto;i++) {
if (other.values[i] != values[i]) {
return false;
}
}
return true;
}
@Override @Override
public String toString() { public String toString() {
StringBuilder sb = new StringBuilder().append('['); StringBuilder sb = new StringBuilder().append('[');
@ -198,79 +182,6 @@ final class SortedIntSet {
sb.append(']'); sb.append(']');
return sb.toString(); return sb.toString();
} }
public final static class FrozenIntSet {
final int[] values;
final int hashCode;
final int state;
public FrozenIntSet(int[] values, int hashCode, int state) {
this.values = values;
this.hashCode = hashCode;
this.state = state;
}
public FrozenIntSet(int num, int state) {
this.values = new int[] {num};
this.state = state;
this.hashCode = 683+num;
}
@Override
public int hashCode() {
return hashCode;
}
@Override
public boolean equals(Object _other) {
if (_other == null) {
return false;
}
if (_other instanceof FrozenIntSet) {
FrozenIntSet other = (FrozenIntSet) _other;
if (hashCode != other.hashCode) {
return false;
}
if (other.values.length != values.length) {
return false;
}
for(int i=0;i<values.length;i++) {
if (other.values[i] != values[i]) {
return false;
}
}
return true;
} else if (_other instanceof SortedIntSet) {
SortedIntSet other = (SortedIntSet) _other;
if (hashCode != other.hashCode) {
return false;
}
if (other.values.length != values.length) {
return false;
}
for(int i=0;i<values.length;i++) {
if (other.values[i] != values[i]) {
return false;
}
}
return true;
}
return false;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder().append('[');
for(int i=0;i<values.length;i++) {
if (i > 0) {
sb.append(' ');
}
sb.append(values[i]);
}
sb.append(']');
return sb.toString();
}
}
} }

View File

@ -0,0 +1,94 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.automaton;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class TestIntSet extends LuceneTestCase {
@Test
public void testFreezeEqualitySmallSet() {
testFreezeEquality(10);
}
@Test
public void testFreezeEqualityLargeSet() {
testFreezeEquality(100);
}
private void testFreezeEquality(int size) {
SortedIntSet sortedSet = new SortedIntSet(0);
for (int i = 0; i < size; i++) {
// Some duplicates is nice but not critical
sortedSet.incr(random().nextInt(i + 1));
}
sortedSet.computeHash();
IntSet frozen0 = sortedSet.freeze(0);
assertEquals("Frozen set not equal to origin sorted set.", sortedSet, frozen0);
assertEquals("Symmetry: Sorted set not equal to frozen set.", frozen0, sortedSet);
IntSet frozen1 = sortedSet.freeze(random().nextInt());
assertEquals("Sorted set modified while freezing?", sortedSet, frozen1);
assertEquals("Frozen sets were not equal", frozen0, frozen1);
}
@Test
public void testMapCutover() {
SortedIntSet set = new SortedIntSet(10);
for (int i = 0; i < 35; i++) {
// No duplicates so there are enough elements to trigger impl cutover
set.incr(i);
}
set.computeHash();
assertTrue(set.size() > 32);
for (int i = 0; i < 35; i++) {
// This is pretty much the worst case, perf wise
set.decr(i);
}
set.computeHash();
assertTrue(set.size() == 0);
}
@Test
public void testModify() {
SortedIntSet set = new SortedIntSet(2);
set.incr(1);
set.incr(2);
set.computeHash();
FrozenIntSet set2 = set.freeze(0);
assertEquals(set, set2);
set.incr(1);
set.computeHash();
assertEquals(set, set2);
set.decr(1);
set.computeHash();
assertEquals(set, set2);
set.decr(1);
set.computeHash();
assertNotEquals(set, set2);
}
}