mirror of https://github.com/apache/lucene.git
LUCENE-8920: Fix bug preventing FST duplicate tails from being shared when encoded as array-with-gaps
This commit is contained in:
parent
fe1653b938
commit
e97380ad20
|
@ -195,6 +195,10 @@ public final class FST<T> implements Accountable {
|
||||||
return flag(BIT_FINAL_ARC);
|
return flag(BIT_FINAL_ARC);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean isPackedArray() {
|
||||||
|
return bytesPerArc != 0 && arcIdx > Integer.MIN_VALUE;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
StringBuilder b = new StringBuilder();
|
StringBuilder b = new StringBuilder();
|
||||||
|
@ -569,7 +573,6 @@ public final class FST<T> implements Accountable {
|
||||||
return NON_FINAL_END_NODE;
|
return NON_FINAL_END_NODE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final long startAddress = builder.bytes.getPosition();
|
final long startAddress = builder.bytes.getPosition();
|
||||||
//System.out.println(" startAddr=" + startAddress);
|
//System.out.println(" startAddr=" + startAddress);
|
||||||
|
|
||||||
|
|
|
@ -41,10 +41,10 @@ final class NodeHash<T> {
|
||||||
|
|
||||||
private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
|
private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
|
||||||
fst.readFirstRealTargetArc(address, scratchArc, in);
|
fst.readFirstRealTargetArc(address, scratchArc, in);
|
||||||
if (scratchArc.bytesPerArc() != 0 && node.numArcs != scratchArc.numArcs()) {
|
if (scratchArc.isPackedArray() && node.numArcs != scratchArc.numArcs()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for(int arcUpto=0;arcUpto<node.numArcs;arcUpto++) {
|
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
|
||||||
final Builder.Arc<T> arc = node.arcs[arcUpto];
|
final Builder.Arc<T> arc = node.arcs[arcUpto];
|
||||||
if (arc.label != scratchArc.label() ||
|
if (arc.label != scratchArc.label() ||
|
||||||
!arc.output.equals(scratchArc.output()) ||
|
!arc.output.equals(scratchArc.output()) ||
|
||||||
|
@ -170,4 +170,5 @@ final class NodeHash<T> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,8 +20,10 @@ import java.nio.file.Files;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.lucene.store.ByteArrayDataInput;
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
import org.apache.lucene.store.DataInput;
|
import org.apache.lucene.store.DataInput;
|
||||||
|
@ -33,34 +35,47 @@ import org.junit.Before;
|
||||||
|
|
||||||
public class TestFstDirect extends LuceneTestCase {
|
public class TestFstDirect extends LuceneTestCase {
|
||||||
|
|
||||||
private List<String> words;
|
|
||||||
|
|
||||||
@Before
|
|
||||||
public void before() {
|
|
||||||
words = new ArrayList<>();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testDenseWithGap() throws Exception {
|
public void testDenseWithGap() throws Exception {
|
||||||
//words.addAll(Arrays.asList("apple", "berry", "cherry", "damson", "fig", "grape"));
|
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
|
||||||
words.addAll(Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm"));
|
List<BytesRef> entries = new ArrayList<>();
|
||||||
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(words));
|
|
||||||
for (String word : words) {
|
for (String word : words) {
|
||||||
assertNotNull(word + " not found", fstEnum.seekExact(new BytesRef(word)));
|
entries.add(new BytesRef(word.getBytes("ascii")));
|
||||||
|
}
|
||||||
|
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(entries));
|
||||||
|
for (BytesRef entry : entries) {
|
||||||
|
assertNotNull(entry.utf8ToString() + " not found", fstEnum.seekExact(entry));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testDeDupTails() throws Exception {
|
||||||
|
List<BytesRef> entries = new ArrayList<>();
|
||||||
|
for (int i = 0; i < 1000000; i += 4) {
|
||||||
|
byte[] b = new byte[3];
|
||||||
|
int val = i;
|
||||||
|
for (int j = b.length - 1; j >= 0; --j) {
|
||||||
|
b[j] = (byte) (val & 0xff);
|
||||||
|
val >>= 8;
|
||||||
|
}
|
||||||
|
entries.add(new BytesRef(b));
|
||||||
|
}
|
||||||
|
long size = buildFST(entries).ramBytesUsed();
|
||||||
|
// Size is 1664 when we use only list-encoding. We were previously failing to ever de-dup
|
||||||
|
// arrays-with-gaps, which led this case to blow up.
|
||||||
|
assertTrue(size < 3000);
|
||||||
|
//printf("fst size = %d bytes", size);
|
||||||
|
}
|
||||||
|
|
||||||
private FST<Object> buildFST(List<String> words) throws Exception {
|
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
|
||||||
long start = System.nanoTime();
|
|
||||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||||
|
BytesRef last = null;
|
||||||
for (String word : words) {
|
for (BytesRef entry : entries) {
|
||||||
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
|
if (entry.equals(last) == false) {
|
||||||
|
b.add(Util.toIntsRef(entry, new IntsRefBuilder()), outputs.getNoOutput());
|
||||||
|
}
|
||||||
|
last = entry;
|
||||||
}
|
}
|
||||||
FST<Object> fst = b.finish();
|
FST<Object> fst = b.finish();
|
||||||
long t = System.nanoTime();
|
|
||||||
printf("Built FST of %d bytes in %d ms", fst.ramBytesUsed(), nsToMs(t - start));
|
|
||||||
return fst;
|
return fst;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue