mirror of https://github.com/apache/lucene.git
LUCENE-9142 Refactor IntSet operations for determinize (#1184)
* LUCENE-9142 Refactor SortedIntSet for equality Split SortedIntSet into a class heirarchy to make comparisons to FrozenIntSet more meaningful. Use Arrays.equals for more efficient comparison. Add tests for IntSet to verify correctness.
This commit is contained in:
parent
f3cd1dbde3
commit
abd282d258
|
@ -0,0 +1,51 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.util.automaton;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
final class FrozenIntSet extends IntSet {
|
||||||
|
final int[] values;
|
||||||
|
final int hashCode;
|
||||||
|
final int state;
|
||||||
|
|
||||||
|
FrozenIntSet(int[] values, int hashCode, int state) {
|
||||||
|
this.values = values;
|
||||||
|
this.hashCode = hashCode;
|
||||||
|
this.state = state;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
int[] getArray() {
|
||||||
|
return values;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
int size() {
|
||||||
|
return values.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return hashCode;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return Arrays.toString(values);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,46 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.util.automaton;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
abstract class IntSet {
|
||||||
|
/**
|
||||||
|
* Return an array representation of this int set's values. Values are valid for indices [0, {@link #size()}).
|
||||||
|
* If this is a mutable int set, then changes to the set are not guaranteed to be visible in this array.
|
||||||
|
* @return an array containing the values for this set, guaranteed to be at least {@link #size()} elements
|
||||||
|
*/
|
||||||
|
abstract int[] getArray();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Guaranteed to be less than or equal to the length of the array returned by {@link #getArray()}.
|
||||||
|
* @return The number of values in this set.
|
||||||
|
*/
|
||||||
|
abstract int size();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public abstract int hashCode();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (this == o) return true;
|
||||||
|
if (!(o instanceof IntSet)) return false;
|
||||||
|
IntSet that = (IntSet) o;
|
||||||
|
return hashCode() == that.hashCode()
|
||||||
|
&& Arrays.equals(getArray(), 0, size(), that.getArray(), 0, that.size());
|
||||||
|
}
|
||||||
|
}
|
|
@ -682,13 +682,14 @@ final public class Operations {
|
||||||
//System.out.println("DET:");
|
//System.out.println("DET:");
|
||||||
//a.writeDot("/l/la/lucene/core/detin.dot");
|
//a.writeDot("/l/la/lucene/core/detin.dot");
|
||||||
|
|
||||||
SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(0, 0);
|
// Same initial values and state will always have the same hashCode
|
||||||
|
FrozenIntSet initialset = new FrozenIntSet(new int[] { 0 }, 683, 0);
|
||||||
|
|
||||||
// Create state 0:
|
// Create state 0:
|
||||||
b.createState();
|
b.createState();
|
||||||
|
|
||||||
ArrayDeque<SortedIntSet.FrozenIntSet> worklist = new ArrayDeque<>();
|
ArrayDeque<FrozenIntSet> worklist = new ArrayDeque<>();
|
||||||
Map<SortedIntSet.FrozenIntSet,Integer> newstate = new HashMap<>();
|
Map<IntSet,Integer> newstate = new HashMap<>();
|
||||||
|
|
||||||
worklist.add(initialset);
|
worklist.add(initialset);
|
||||||
|
|
||||||
|
@ -704,7 +705,7 @@ final public class Operations {
|
||||||
Transition t = new Transition();
|
Transition t = new Transition();
|
||||||
|
|
||||||
while (worklist.size() > 0) {
|
while (worklist.size() > 0) {
|
||||||
SortedIntSet.FrozenIntSet s = worklist.removeFirst();
|
FrozenIntSet s = worklist.removeFirst();
|
||||||
//System.out.println("det: pop set=" + s);
|
//System.out.println("det: pop set=" + s);
|
||||||
|
|
||||||
// Collate all outgoing transitions by min/1+max:
|
// Collate all outgoing transitions by min/1+max:
|
||||||
|
@ -745,7 +746,7 @@ final public class Operations {
|
||||||
if (q >= maxDeterminizedStates) {
|
if (q >= maxDeterminizedStates) {
|
||||||
throw new TooComplexToDeterminizeException(a, maxDeterminizedStates);
|
throw new TooComplexToDeterminizeException(a, maxDeterminizedStates);
|
||||||
}
|
}
|
||||||
final SortedIntSet.FrozenIntSet p = statesSet.freeze(q);
|
final FrozenIntSet p = statesSet.freeze(q);
|
||||||
//System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount);
|
//System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount);
|
||||||
worklist.add(p);
|
worklist.add(p);
|
||||||
b.setAccept(q, accCount > 0);
|
b.setAccept(q, accCount > 0);
|
||||||
|
|
|
@ -21,9 +21,8 @@ import java.util.Map;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
|
||||||
// Just holds a set of int[] states, plus a corresponding
|
// Just holds a set of int[] states, plus a corresponding
|
||||||
// int[] count per state. Used by
|
// int[] count per state. Used by Operations.determinize
|
||||||
// BasicOperations.determinize
|
final class SortedIntSet extends IntSet {
|
||||||
final class SortedIntSet {
|
|
||||||
int[] values;
|
int[] values;
|
||||||
int[] counts;
|
int[] counts;
|
||||||
int upto;
|
int upto;
|
||||||
|
@ -37,9 +36,7 @@ final class SortedIntSet {
|
||||||
|
|
||||||
private boolean useTreeMap;
|
private boolean useTreeMap;
|
||||||
|
|
||||||
int state;
|
SortedIntSet(int capacity) {
|
||||||
|
|
||||||
public SortedIntSet(int capacity) {
|
|
||||||
values = new int[capacity];
|
values = new int[capacity];
|
||||||
counts = new int[capacity];
|
counts = new int[capacity];
|
||||||
}
|
}
|
||||||
|
@ -47,13 +44,7 @@ final class SortedIntSet {
|
||||||
// Adds this state to the set
|
// Adds this state to the set
|
||||||
public void incr(int num) {
|
public void incr(int num) {
|
||||||
if (useTreeMap) {
|
if (useTreeMap) {
|
||||||
final Integer key = num;
|
map.merge(num, 1, Integer::sum);
|
||||||
Integer val = map.get(key);
|
|
||||||
if (val == null) {
|
|
||||||
map.put(key, 1);
|
|
||||||
} else {
|
|
||||||
map.put(key, 1+val);
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -130,7 +121,7 @@ final class SortedIntSet {
|
||||||
assert false;
|
assert false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void computeHash() {
|
void computeHash() {
|
||||||
if (useTreeMap) {
|
if (useTreeMap) {
|
||||||
if (map.size() > values.length) {
|
if (map.size() > values.length) {
|
||||||
final int size = ArrayUtil.oversize(map.size(), Integer.BYTES);
|
final int size = ArrayUtil.oversize(map.size(), Integer.BYTES);
|
||||||
|
@ -151,41 +142,34 @@ final class SortedIntSet {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public FrozenIntSet freeze(int state) {
|
/**
|
||||||
final int[] c = new int[upto];
|
* Create a snapshot of this int set associated with a given state. The snapshot will not retain any frequency
|
||||||
System.arraycopy(values, 0, c, 0, upto);
|
* information about the elements of this set, only existence.
|
||||||
|
* <p>
|
||||||
|
* It is the caller's responsibility to ensure that the hashCode and data are up to date via the {@link #computeHash()} method before calling this method.
|
||||||
|
* @param state the state to associate with the frozen set.
|
||||||
|
* @return A new FrozenIntSet with the same values as this set.
|
||||||
|
*/
|
||||||
|
FrozenIntSet freeze(int state) {
|
||||||
|
final int[] c = ArrayUtil.copyOfSubArray(values, 0, upto);
|
||||||
return new FrozenIntSet(c, hashCode, state);
|
return new FrozenIntSet(c, hashCode, state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
int[] getArray() {
|
||||||
|
return values;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
int size() {
|
||||||
|
return upto;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return hashCode;
|
return hashCode;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object _other) {
|
|
||||||
if (_other == null) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!(_other instanceof FrozenIntSet)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
FrozenIntSet other = (FrozenIntSet) _other;
|
|
||||||
if (hashCode != other.hashCode) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (other.values.length != upto) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
for(int i=0;i<upto;i++) {
|
|
||||||
if (other.values[i] != values[i]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
StringBuilder sb = new StringBuilder().append('[');
|
StringBuilder sb = new StringBuilder().append('[');
|
||||||
|
@ -198,79 +182,6 @@ final class SortedIntSet {
|
||||||
sb.append(']');
|
sb.append(']');
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public final static class FrozenIntSet {
|
|
||||||
final int[] values;
|
|
||||||
final int hashCode;
|
|
||||||
final int state;
|
|
||||||
|
|
||||||
public FrozenIntSet(int[] values, int hashCode, int state) {
|
|
||||||
this.values = values;
|
|
||||||
this.hashCode = hashCode;
|
|
||||||
this.state = state;
|
|
||||||
}
|
|
||||||
|
|
||||||
public FrozenIntSet(int num, int state) {
|
|
||||||
this.values = new int[] {num};
|
|
||||||
this.state = state;
|
|
||||||
this.hashCode = 683+num;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
return hashCode;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object _other) {
|
|
||||||
if (_other == null) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (_other instanceof FrozenIntSet) {
|
|
||||||
FrozenIntSet other = (FrozenIntSet) _other;
|
|
||||||
if (hashCode != other.hashCode) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (other.values.length != values.length) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
for(int i=0;i<values.length;i++) {
|
|
||||||
if (other.values[i] != values[i]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
} else if (_other instanceof SortedIntSet) {
|
|
||||||
SortedIntSet other = (SortedIntSet) _other;
|
|
||||||
if (hashCode != other.hashCode) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (other.values.length != values.length) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
for(int i=0;i<values.length;i++) {
|
|
||||||
if (other.values[i] != values[i]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
StringBuilder sb = new StringBuilder().append('[');
|
|
||||||
for(int i=0;i<values.length;i++) {
|
|
||||||
if (i > 0) {
|
|
||||||
sb.append(' ');
|
|
||||||
}
|
|
||||||
sb.append(values[i]);
|
|
||||||
}
|
|
||||||
sb.append(']');
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,94 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.util.automaton;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class TestIntSet extends LuceneTestCase {
|
||||||
|
@Test
|
||||||
|
public void testFreezeEqualitySmallSet() {
|
||||||
|
testFreezeEquality(10);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFreezeEqualityLargeSet() {
|
||||||
|
testFreezeEquality(100);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testFreezeEquality(int size) {
|
||||||
|
SortedIntSet sortedSet = new SortedIntSet(0);
|
||||||
|
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
// Some duplicates is nice but not critical
|
||||||
|
sortedSet.incr(random().nextInt(i + 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
sortedSet.computeHash();
|
||||||
|
IntSet frozen0 = sortedSet.freeze(0);
|
||||||
|
|
||||||
|
assertEquals("Frozen set not equal to origin sorted set.", sortedSet, frozen0);
|
||||||
|
assertEquals("Symmetry: Sorted set not equal to frozen set.", frozen0, sortedSet);
|
||||||
|
|
||||||
|
IntSet frozen1 = sortedSet.freeze(random().nextInt());
|
||||||
|
assertEquals("Sorted set modified while freezing?", sortedSet, frozen1);
|
||||||
|
assertEquals("Frozen sets were not equal", frozen0, frozen1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMapCutover() {
|
||||||
|
SortedIntSet set = new SortedIntSet(10);
|
||||||
|
for (int i = 0; i < 35; i++) {
|
||||||
|
// No duplicates so there are enough elements to trigger impl cutover
|
||||||
|
set.incr(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
set.computeHash();
|
||||||
|
assertTrue(set.size() > 32);
|
||||||
|
|
||||||
|
for (int i = 0; i < 35; i++) {
|
||||||
|
// This is pretty much the worst case, perf wise
|
||||||
|
set.decr(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
set.computeHash();
|
||||||
|
assertTrue(set.size() == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testModify() {
|
||||||
|
SortedIntSet set = new SortedIntSet(2);
|
||||||
|
set.incr(1);
|
||||||
|
set.incr(2);
|
||||||
|
set.computeHash();
|
||||||
|
|
||||||
|
FrozenIntSet set2 = set.freeze(0);
|
||||||
|
assertEquals(set, set2);
|
||||||
|
|
||||||
|
set.incr(1);
|
||||||
|
set.computeHash();
|
||||||
|
assertEquals(set, set2);
|
||||||
|
|
||||||
|
set.decr(1);
|
||||||
|
set.computeHash();
|
||||||
|
assertEquals(set, set2);
|
||||||
|
|
||||||
|
set.decr(1);
|
||||||
|
set.computeHash();
|
||||||
|
assertNotEquals(set, set2);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue