LUCENE-8920: Fix bug preventing FST duplicate tails from being shared when encoded as array-with-gaps

This commit is contained in:
Michael Sokolov 2019-07-21 11:39:39 -04:00 committed by Adrien Grand
parent fe1653b938
commit e97380ad20
3 changed files with 40 additions and 21 deletions

View File

@ -195,6 +195,10 @@ public final class FST<T> implements Accountable {
return flag(BIT_FINAL_ARC);
}
/**
 * Returns {@code true} when {@code bytesPerArc} is non-zero and {@code arcIdx} holds any
 * value other than {@code Integer.MIN_VALUE}.
 *
 * <p>NOTE(review): {@code Integer.MIN_VALUE} in {@code arcIdx} looks like the sentinel for the
 * "array-with-gaps" encoding named in LUCENE-8920, which would make this "fixed-size arc
 * array without gaps" — confirm against the code that assigns {@code arcIdx}.
 */
public boolean isPackedArray() {
return bytesPerArc != 0 && arcIdx > Integer.MIN_VALUE;
}
@Override
public String toString() {
StringBuilder b = new StringBuilder();
@ -569,7 +573,6 @@ public final class FST<T> implements Accountable {
return NON_FINAL_END_NODE;
}
}
final long startAddress = builder.bytes.getPosition();
//System.out.println(" startAddr=" + startAddress);

View File

@ -41,10 +41,10 @@ final class NodeHash<T> {
private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
fst.readFirstRealTargetArc(address, scratchArc, in);
if (scratchArc.bytesPerArc() != 0 && node.numArcs != scratchArc.numArcs()) {
if (scratchArc.isPackedArray() && node.numArcs != scratchArc.numArcs()) {
return false;
}
for(int arcUpto=0;arcUpto<node.numArcs;arcUpto++) {
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
final Builder.Arc<T> arc = node.arcs[arcUpto];
if (arc.label != scratchArc.label() ||
!arc.output.equals(scratchArc.output()) ||
@ -170,4 +170,5 @@ final class NodeHash<T> {
}
}
}
}

View File

@ -20,8 +20,10 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
@ -33,34 +35,47 @@ import org.junit.Before;
public class TestFstDirect extends LuceneTestCase {
private List<String> words;
/** Replaces {@code words} with a fresh, empty list; annotated to run as JUnit {@code @Before} setup. */
@Before
public void before() {
words = new ArrayList<>();
}
public void testDenseWithGap() throws Exception {
//words.addAll(Arrays.asList("apple", "berry", "cherry", "damson", "fig", "grape"));
words.addAll(Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm"));
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(words));
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
List<BytesRef> entries = new ArrayList<>();
for (String word : words) {
assertNotNull(word + " not found", fstEnum.seekExact(new BytesRef(word)));
entries.add(new BytesRef(word.getBytes("ascii")));
}
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(entries));
for (BytesRef entry : entries) {
assertNotNull(entry.utf8ToString() + " not found", fstEnum.seekExact(entry));
}
}
/**
 * Builds an FST over a large, regular key set and checks that its reported RAM usage stays
 * small, guarding against the failure to share duplicate tails described in LUCENE-8920.
 *
 * <p>The keys are every fourth integer in {@code [0, 1000000)}, each written as three
 * big-endian bytes, so they are produced in ascending unsigned byte order.
 *
 * @throws Exception if building the FST fails
 */
public void testDeDupTails() throws Exception {
  final int keyWidth = 3;
  final long maxExpectedBytes = 3000;
  List<BytesRef> entries = new ArrayList<>();
  for (int i = 0; i < 1000000; i += 4) {
    byte[] b = new byte[keyWidth];
    int val = i;
    // Fill from the last byte backwards so the least-significant byte ends up last (big-endian).
    for (int j = b.length - 1; j >= 0; --j) {
      b[j] = (byte) (val & 0xff);
      val >>= 8;
    }
    entries.add(new BytesRef(b));
  }
  long size = buildFST(entries).ramBytesUsed();
  // Size is 1664 when we use only list-encoding. We were previously failing to ever de-dup
  // arrays-with-gaps, which led this case to blow up.
  // Report the measured size on failure so a regression can be diagnosed from the test log alone.
  assertTrue("fst size = " + size + " bytes, expected less than " + maxExpectedBytes,
      size < maxExpectedBytes);
}
private FST<Object> buildFST(List<String> words) throws Exception {
long start = System.nanoTime();
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
final Outputs<Object> outputs = NoOutputs.getSingleton();
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
for (String word : words) {
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
BytesRef last = null;
for (BytesRef entry : entries) {
if (entry.equals(last) == false) {
b.add(Util.toIntsRef(entry, new IntsRefBuilder()), outputs.getNoOutput());
}
last = entry;
}
FST<Object> fst = b.finish();
long t = System.nanoTime();
printf("Built FST of %d bytes in %d ms", fst.ramBytesUsed(), nsToMs(t - start));
return fst;
}