mirror of https://github.com/apache/lucene.git
LUCENE-8920: Fix bug preventing FST duplicate tails from being shared when encoded as array-with-gaps
This commit is contained in:
parent
d9d16eec95
commit
d1706b36ba
|
@ -197,6 +197,10 @@ public final class FST<T> implements Accountable {
|
|||
return flag(BIT_FINAL_ARC);
|
||||
}
|
||||
|
||||
/**
 * Returns {@code true} when {@code bytesPerArc} is non-zero and {@code arcIdx} is
 * greater than {@code Integer.MIN_VALUE}.
 *
 * <p>NodeHash.nodesEqual (later in this same commit view) uses this check to decide
 * whether comparing {@code numArcs()} against the uncompiled node's arc count is valid.
 *
 * <p>NOTE(review): presumably {@code arcIdx == Integer.MIN_VALUE} marks the
 * "array-with-gaps" encoding named in the commit message, for which the stored arc
 * count would include the gaps -- confirm against the full FST.Arc source.
 */
public boolean isPackedArray() {
|
||||
return bytesPerArc != 0 && arcIdx > Integer.MIN_VALUE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
|
@ -569,7 +573,6 @@ public final class FST<T> implements Accountable {
|
|||
return NON_FINAL_END_NODE;
|
||||
}
|
||||
}
|
||||
|
||||
final long startAddress = builder.bytes.getPosition();
|
||||
//System.out.println(" startAddr=" + startAddress);
|
||||
|
||||
|
@ -685,7 +688,6 @@ public final class FST<T> implements Accountable {
|
|||
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
|
||||
boolean writeDirectly = labelRange > 0 && labelRange < Builder.DIRECT_ARC_LOAD_FACTOR * nodeIn.numArcs;
|
||||
|
||||
//System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs);
|
||||
// create the header
|
||||
// TODO: clean this up: or just rewind+reuse and deal with it
|
||||
byte[] header = new byte[MAX_HEADER_SIZE];
|
||||
|
|
|
@ -41,7 +41,7 @@ final class NodeHash<T> {
|
|||
|
||||
private boolean nodesEqual(Builder.UnCompiledNode<T> node, long address) throws IOException {
|
||||
fst.readFirstRealTargetArc(address, scratchArc, in);
|
||||
if (scratchArc.bytesPerArc() != 0 && node.numArcs != scratchArc.numArcs()) {
|
||||
if (scratchArc.isPackedArray() && node.numArcs != scratchArc.numArcs()) {
|
||||
return false;
|
||||
}
|
||||
for(int arcUpto=0; arcUpto < node.numArcs; arcUpto++) {
|
||||
|
@ -170,4 +170,5 @@ final class NodeHash<T> {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,8 +20,10 @@ import java.nio.file.Files;
|
|||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
|
@ -33,34 +35,47 @@ import org.junit.Before;
|
|||
|
||||
public class TestFstDirect extends LuceneTestCase {
|
||||
|
||||
private List<String> words;
|
||||
|
||||
// JUnit set-up hook: replaces the words field with a new, empty ArrayList.
@Before
|
||||
public void before() {
|
||||
words = new ArrayList<>();
|
||||
}
|
||||
|
||||
public void testDenseWithGap() throws Exception {
|
||||
//words.addAll(Arrays.asList("apple", "berry", "cherry", "damson", "fig", "grape"));
|
||||
words.addAll(Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm"));
|
||||
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(words));
|
||||
List<String> words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm");
|
||||
List<BytesRef> entries = new ArrayList<>();
|
||||
for (String word : words) {
|
||||
assertNotNull(word + " not found", fstEnum.seekExact(new BytesRef(word)));
|
||||
entries.add(new BytesRef(word.getBytes("ascii")));
|
||||
}
|
||||
final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(buildFST(entries));
|
||||
for (BytesRef entry : entries) {
|
||||
assertNotNull(entry.utf8ToString() + " not found", fstEnum.seekExact(entry));
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Regression test for LUCENE-8920: builds an FST from 250,000 three-byte keys and
 * asserts that its reported RAM usage stays below 3000 bytes.
 *
 * <p>The keys are the values 0, 4, 8, ... (below 1,000,000), each written as a 3-byte
 * big-endian number, so they are generated in ascending order.
 *
 * <p>NOTE(review): the step of 4 presumably leaves gaps between the labels of the last
 * byte so that those nodes are written as arrays-with-gaps -- confirm against
 * Builder.DIRECT_ARC_LOAD_FACTOR.
 */
public void testDeDupTails() throws Exception {
|
||||
List<BytesRef> entries = new ArrayList<>();
|
||||
// Every fourth integer in [0, 1000000) becomes one key.
for (int i = 0; i < 1000000; i += 4) {
|
||||
byte[] b = new byte[3];
|
||||
int val = i;
|
||||
// Fill the array from the last byte to the first: the least significant byte of
// i lands in b[2], i.e. big-endian order. 1,000,000 < 2^24, so 3 bytes suffice.
for (int j = b.length - 1; j >= 0; --j) {
|
||||
b[j] = (byte) (val & 0xff);
|
||||
val >>= 8;
|
||||
}
|
||||
entries.add(new BytesRef(b));
|
||||
}
|
||||
// Only the size of the built FST is checked; the FST itself is discarded.
long size = buildFST(entries).ramBytesUsed();
|
||||
// Size is 1664 when we use only list-encoding. We were previously failing to ever de-dup
|
||||
// arrays-with-gaps, which led this case to blow up.
|
||||
// The 3000-byte bound leaves headroom above the 1664-byte list-encoded size quoted above.
assertTrue(size < 3000);
|
||||
//printf("fst size = %d bytes", size);
|
||||
}
|
||||
|
||||
private FST<Object> buildFST(List<String> words) throws Exception {
|
||||
long start = System.nanoTime();
|
||||
private FST<Object> buildFST(List<BytesRef> entries) throws Exception {
|
||||
final Outputs<Object> outputs = NoOutputs.getSingleton();
|
||||
final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
|
||||
|
||||
for (String word : words) {
|
||||
b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput());
|
||||
BytesRef last = null;
|
||||
for (BytesRef entry : entries) {
|
||||
if (entry.equals(last) == false) {
|
||||
b.add(Util.toIntsRef(entry, new IntsRefBuilder()), outputs.getNoOutput());
|
||||
}
|
||||
last = entry;
|
||||
}
|
||||
FST<Object> fst = b.finish();
|
||||
long t = System.nanoTime();
|
||||
printf("Built FST of %d bytes in %d ms", fst.ramBytesUsed(), nsToMs(t - start));
|
||||
return fst;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue