From e97380ad20d5f6af3a90c37383468678cf6bfcc7 Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Sun, 21 Jul 2019 11:39:39 -0400 Subject: [PATCH] LUCENE-8920: Fix bug preventing FST duplicate tails from being shared when encoded as array-with-gaps --- .../java/org/apache/lucene/util/fst/FST.java | 5 +- .../org/apache/lucene/util/fst/NodeHash.java | 5 +- .../apache/lucene/util/fst/TestFstDirect.java | 51 ++++++++++++------- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index e8aac93a95f..3389343f02d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -195,6 +195,10 @@ public final class FST implements Accountable { return flag(BIT_FINAL_ARC); } + public boolean isPackedArray() { + return bytesPerArc != 0 && arcIdx > Integer.MIN_VALUE; + } + @Override public String toString() { StringBuilder b = new StringBuilder(); @@ -569,7 +573,6 @@ public final class FST implements Accountable { return NON_FINAL_END_NODE; } } - final long startAddress = builder.bytes.getPosition(); //System.out.println(" startAddr=" + startAddress); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java index 0de8c97b136..8530ccbdc79 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -41,10 +41,10 @@ final class NodeHash { private boolean nodesEqual(Builder.UnCompiledNode node, long address) throws IOException { fst.readFirstRealTargetArc(address, scratchArc, in); - if (scratchArc.bytesPerArc() != 0 && node.numArcs != scratchArc.numArcs()) { + if (scratchArc.isPackedArray() && node.numArcs != scratchArc.numArcs()) { return false; } - for(int arcUpto=0;arcUpto arc = node.arcs[arcUpto]; if (arc.label != scratchArc.label() || !arc.output.equals(scratchArc.output()) || @@ -170,4 +170,5 @@ final class NodeHash { } } } + } diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFstDirect.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFstDirect.java index 8763d0416d7..a87aae9811e 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFstDirect.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFstDirect.java @@ -20,8 +20,10 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Locale; +import java.util.stream.Collectors; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.DataInput; @@ -33,34 +35,47 @@ import org.junit.Before; public class TestFstDirect extends LuceneTestCase { - private List words; - - @Before - public void before() { - words = new ArrayList<>(); - } - public void testDenseWithGap() throws Exception { - //words.addAll(Arrays.asList("apple", "berry", "cherry", "damson", "fig", "grape")); - words.addAll(Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm")); - final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum<>(buildFST(words)); + List words = Arrays.asList("ah", "bi", "cj", "dk", "fl", "gm"); + List entries = new ArrayList<>(); for (String word : words) { - assertNotNull(word + " not found", fstEnum.seekExact(new BytesRef(word))); + entries.add(new BytesRef(word.getBytes("ascii"))); + } + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum<>(buildFST(entries)); + for (BytesRef entry : entries) { + assertNotNull(entry.utf8ToString() + " not found", fstEnum.seekExact(entry)); } } + public void testDeDupTails() throws Exception { + List entries = new ArrayList<>(); + for (int i = 0; i < 1000000; i += 4) { + byte[] b = new byte[3]; + int val = i; + for (int j = b.length - 1; j >= 0; --j) { + b[j] = (byte) (val & 0xff); + val >>= 8; + } + entries.add(new BytesRef(b)); + } + long size = buildFST(entries).ramBytesUsed(); + // Size is 1664 when we use only list-encoding. We were previously failing to ever de-dup + // arrays-with-gaps, which led this case to blow up. + assertTrue(size < 3000); + //printf("fst size = %d bytes", size); + } - private FST buildFST(List words) throws Exception { - long start = System.nanoTime(); + private FST buildFST(List entries) throws Exception { final Outputs outputs = NoOutputs.getSingleton(); final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); - - for (String word : words) { - b.add(Util.toIntsRef(new BytesRef(word), new IntsRefBuilder()), outputs.getNoOutput()); + BytesRef last = null; + for (BytesRef entry : entries) { + if (entry.equals(last) == false) { + b.add(Util.toIntsRef(entry, new IntsRefBuilder()), outputs.getNoOutput()); + } + last = entry; } FST fst = b.finish(); - long t = System.nanoTime(); - printf("Built FST of %d bytes in %d ms", fst.ramBytesUsed(), nsToMs(t - start)); return fst; }