mirror of https://github.com/apache/lucene.git
LUCENE-9830: Hunspell: store word length for faster dictionary lookup/enumeration (#3)
This commit is contained in:
parent
42c6f780bf
commit
8913a98379
|
@ -74,6 +74,7 @@ class GeneratingSuggester {
|
||||||
};
|
};
|
||||||
|
|
||||||
dictionary.words.processAllWords(
|
dictionary.words.processAllWords(
|
||||||
|
Math.max(1, word.length() - 4),
|
||||||
word.length() + 4,
|
word.length() + 4,
|
||||||
(rootChars, forms) -> {
|
(rootChars, forms) -> {
|
||||||
speller.checkCanceled.run();
|
speller.checkCanceled.run();
|
||||||
|
|
|
@ -49,9 +49,21 @@ import org.apache.lucene.util.fst.IntSequenceOutputs;
|
||||||
* DataOutput#writeVInt} ()} VINT} format for compression.
|
* DataOutput#writeVInt} ()} VINT} format for compression.
|
||||||
*/
|
*/
|
||||||
class WordStorage {
|
class WordStorage {
|
||||||
|
private static final int OFFSET_BITS = 25;
|
||||||
|
private static final int OFFSET_MASK = (1 << OFFSET_BITS) - 1;
|
||||||
|
private static final int COLLISION_MASK = 0x40;
|
||||||
|
private static final int MAX_STORED_LENGTH = COLLISION_MASK - 1;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A map from word's hash (modulo array's length) into the offset of the last entry in {@link
|
* A map from word's hash (modulo array's length) into an int containing:
|
||||||
* #wordData} with this hash. Negated, if there's more than one entry with the same hash.
|
*
|
||||||
|
* <ul>
|
||||||
|
* <li>lower {@link #OFFSET_BITS}: the offset in {@link #wordData} of the last entry with this
|
||||||
|
* hash
|
||||||
|
* <li>the remaining highest bits: COLLISION+LENGTH info for that entry, i.e. one bit indicating
|
||||||
|
* whether there are other entries with the same hash, and the length of the entry in chars,
|
||||||
|
* or {@link #MAX_STORED_LENGTH} if the length exceeds that limit (next highest bits)
|
||||||
|
* </ul>
|
||||||
*/
|
*/
|
||||||
private final int[] hashTable;
|
private final int[] hashTable;
|
||||||
|
|
||||||
|
@ -63,17 +75,14 @@ class WordStorage {
|
||||||
* <li>VINT: a delta pointer to the entry for the same word without the last character.
|
* <li>VINT: a delta pointer to the entry for the same word without the last character.
|
||||||
* Precisely, it's the difference of this entry's start and the prefix's entry start. 0 for
|
* Precisely, it's the difference of this entry's start and the prefix's entry start. 0 for
|
||||||
* single-character entries
|
* single-character entries
|
||||||
* <li>Optional, for non-leaf entries only:
|
* <li>(Optional, for hash-colliding entries only)
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>VINT: the length of the word form data, returned from {@link #lookupWord}
|
* <li>BYTE: COLLISION+LENGTH info (see {@link #hashTable}) for the previous entry with
|
||||||
* <li>n * VINT: the word form data
|
* the same hash
|
||||||
* <li>Optional, for hash-colliding entries only:
|
* <li>VINT: (delta) pointer to the previous entry
|
||||||
* <ul>
|
|
||||||
* <li>BYTE: 1 if the next collision entry has further collisions, 0 if it's the
|
|
||||||
* last of the entries with the same hash
|
|
||||||
* <li>VINT: (delta) pointer to the previous entry with the same hash
|
|
||||||
* </ul>
|
|
||||||
* </ul>
|
* </ul>
|
||||||
|
* <li>(Optional, for non-leaf entries only) VINT+: word form data, returned from {@link
|
||||||
|
* #lookupWord}, preceded by its length
|
||||||
* </ul>
|
* </ul>
|
||||||
*/
|
*/
|
||||||
private final byte[] wordData;
|
private final byte[] wordData;
|
||||||
|
@ -87,13 +96,13 @@ class WordStorage {
|
||||||
assert length > 0;
|
assert length > 0;
|
||||||
|
|
||||||
int hash = Math.abs(CharsRef.stringHashCode(word, offset, length) % hashTable.length);
|
int hash = Math.abs(CharsRef.stringHashCode(word, offset, length) % hashTable.length);
|
||||||
int pos = hashTable[hash];
|
int entryCode = hashTable[hash];
|
||||||
if (pos == 0) {
|
if (entryCode == 0) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean collision = pos < 0;
|
int pos = entryCode & OFFSET_MASK;
|
||||||
pos = Math.abs(pos);
|
int mask = entryCode >>> OFFSET_BITS;
|
||||||
|
|
||||||
char lastChar = word[offset + length - 1];
|
char lastChar = word[offset + length - 1];
|
||||||
ByteArrayDataInput in = new ByteArrayDataInput(wordData);
|
ByteArrayDataInput in = new ByteArrayDataInput(wordData);
|
||||||
|
@ -101,46 +110,49 @@ class WordStorage {
|
||||||
in.setPosition(pos);
|
in.setPosition(pos);
|
||||||
char c = (char) in.readVInt();
|
char c = (char) in.readVInt();
|
||||||
int prevPos = pos - in.readVInt();
|
int prevPos = pos - in.readVInt();
|
||||||
int beforeForms = in.getPosition();
|
|
||||||
boolean found = c == lastChar && isSameString(word, offset, length - 1, prevPos, in);
|
boolean last = !hasCollision(mask);
|
||||||
if (!collision && !found) {
|
boolean mightMatch = c == lastChar && hasLength(mask, length);
|
||||||
|
|
||||||
|
if (!last) {
|
||||||
|
mask = in.readByte();
|
||||||
|
pos -= in.readVInt();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mightMatch) {
|
||||||
|
int beforeForms = in.getPosition();
|
||||||
|
if (isSameString(word, offset, length - 1, prevPos, in)) {
|
||||||
|
in.setPosition(beforeForms);
|
||||||
|
int formLength = in.readVInt();
|
||||||
|
IntsRef forms = new IntsRef(formLength);
|
||||||
|
readForms(forms, in, formLength);
|
||||||
|
return forms;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (last) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
in.setPosition(beforeForms);
|
|
||||||
int formLength = in.readVInt();
|
|
||||||
if (found) {
|
|
||||||
IntsRef forms = new IntsRef(formLength);
|
|
||||||
readForms(forms, in, formLength);
|
|
||||||
return forms;
|
|
||||||
} else {
|
|
||||||
skipVInts(in, formLength);
|
|
||||||
}
|
|
||||||
|
|
||||||
collision = in.readByte() == 1;
|
|
||||||
pos -= in.readVInt();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void skipVInts(ByteArrayDataInput in, int count) {
|
private static boolean hasCollision(int mask) {
|
||||||
for (int i = 0; i < count; ) {
|
return (mask & COLLISION_MASK) != 0;
|
||||||
if (in.readByte() >= 0) i++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param maxLength the limit on the length of words to be processed, the callback won't be
|
* Calls the processor for every dictionary entry with length between minLength and maxLength,
|
||||||
* invoked for the longer ones
|
* both ends inclusive. Note that the callback arguments (word and forms) are reused, so they can
|
||||||
* @param processor is invoked for each word. Note that the passed arguments (word and form) are
|
* be modified in any way, but may not be saved for later by the processor
|
||||||
* reused, so they can be modified in any way, but may not be saved for later by the processor
|
|
||||||
*/
|
*/
|
||||||
void processAllWords(int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
|
void processAllWords(int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
|
||||||
|
assert minLength <= maxLength;
|
||||||
CharsRef chars = new CharsRef(maxLength);
|
CharsRef chars = new CharsRef(maxLength);
|
||||||
IntsRef forms = new IntsRef();
|
IntsRef forms = new IntsRef();
|
||||||
ByteArrayDataInput in = new ByteArrayDataInput(wordData);
|
ByteArrayDataInput in = new ByteArrayDataInput(wordData);
|
||||||
for (int pos : hashTable) {
|
for (int entryCode : hashTable) {
|
||||||
boolean collision = pos < 0;
|
int pos = entryCode & OFFSET_MASK;
|
||||||
pos = Math.abs(pos);
|
int mask = entryCode >>> OFFSET_BITS;
|
||||||
|
|
||||||
while (pos != 0) {
|
while (pos != 0) {
|
||||||
int wordStart = maxLength - 1;
|
int wordStart = maxLength - 1;
|
||||||
|
@ -149,37 +161,53 @@ class WordStorage {
|
||||||
chars.chars[wordStart] = (char) in.readVInt();
|
chars.chars[wordStart] = (char) in.readVInt();
|
||||||
int prevPos = pos - in.readVInt();
|
int prevPos = pos - in.readVInt();
|
||||||
|
|
||||||
int dataLength = in.readVInt();
|
boolean last = !hasCollision(mask);
|
||||||
if (forms.ints.length < dataLength) {
|
boolean mightMatch = hasLengthInRange(mask, minLength, maxLength);
|
||||||
forms.ints = new int[dataLength];
|
|
||||||
}
|
|
||||||
readForms(forms, in, dataLength);
|
|
||||||
|
|
||||||
int afterForms = in.getPosition();
|
if (!last) {
|
||||||
|
mask = in.readByte();
|
||||||
while (prevPos != 0 && wordStart > 0) {
|
pos -= in.readVInt();
|
||||||
in.setPosition(prevPos);
|
|
||||||
chars.chars[--wordStart] = (char) in.readVInt();
|
|
||||||
prevPos -= in.readVInt();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (prevPos == 0) {
|
if (mightMatch) {
|
||||||
chars.offset = wordStart;
|
int dataLength = in.readVInt();
|
||||||
chars.length = maxLength - wordStart;
|
if (forms.ints.length < dataLength) {
|
||||||
processor.accept(chars, forms);
|
forms.ints = new int[dataLength];
|
||||||
|
}
|
||||||
|
readForms(forms, in, dataLength);
|
||||||
|
while (prevPos != 0 && wordStart > 0) {
|
||||||
|
in.setPosition(prevPos);
|
||||||
|
chars.chars[--wordStart] = (char) in.readVInt();
|
||||||
|
prevPos -= in.readVInt();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (prevPos == 0) {
|
||||||
|
chars.offset = wordStart;
|
||||||
|
chars.length = maxLength - wordStart;
|
||||||
|
processor.accept(chars, forms);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!collision) {
|
if (last) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
in.setPosition(afterForms);
|
|
||||||
collision = in.readVInt() == 1;
|
|
||||||
pos -= in.readVInt();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean hasLength(int mask, int length) {
|
||||||
|
int lenCode = mask & MAX_STORED_LENGTH;
|
||||||
|
return lenCode == MAX_STORED_LENGTH ? length >= MAX_STORED_LENGTH : lenCode == length;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean hasLengthInRange(int mask, int minLength, int maxLength) {
|
||||||
|
int lenCode = mask & MAX_STORED_LENGTH;
|
||||||
|
if (lenCode == MAX_STORED_LENGTH) {
|
||||||
|
return maxLength >= MAX_STORED_LENGTH;
|
||||||
|
}
|
||||||
|
return lenCode >= minLength && lenCode <= maxLength;
|
||||||
|
}
|
||||||
|
|
||||||
private boolean isSameString(
|
private boolean isSameString(
|
||||||
char[] word, int offset, int length, int dataPos, ByteArrayDataInput in) {
|
char[] word, int offset, int length, int dataPos, ByteArrayDataInput in) {
|
||||||
for (int i = length - 1; i >= 0; i--) {
|
for (int i = length - 1; i >= 0; i--) {
|
||||||
|
@ -317,9 +345,16 @@ class WordStorage {
|
||||||
}
|
}
|
||||||
|
|
||||||
int pos = dataWriter.getPosition();
|
int pos = dataWriter.getPosition();
|
||||||
|
if (pos >= 1 << OFFSET_BITS) {
|
||||||
|
throw new RuntimeException(
|
||||||
|
"Too much word data, please report this to dev@lucene.apache.org");
|
||||||
|
}
|
||||||
int hash = Math.abs(currentEntry.hashCode() % hashTable.length);
|
int hash = Math.abs(currentEntry.hashCode() % hashTable.length);
|
||||||
int collision = hashTable[hash];
|
int prevCode = hashTable[hash];
|
||||||
hashTable[hash] = collision == 0 ? pos : -pos;
|
|
||||||
|
int mask =
|
||||||
|
(prevCode == 0 ? 0 : COLLISION_MASK) | Math.min(currentEntry.length(), MAX_STORED_LENGTH);
|
||||||
|
hashTable[hash] = (mask << OFFSET_BITS) | pos;
|
||||||
|
|
||||||
if (++chainLengths[hash] > 20) {
|
if (++chainLengths[hash] > 20) {
|
||||||
throw new RuntimeException(
|
throw new RuntimeException(
|
||||||
|
@ -329,11 +364,11 @@ class WordStorage {
|
||||||
// write the leaf entry for the last character
|
// write the leaf entry for the last character
|
||||||
dataWriter.writeVInt(currentEntry.charAt(currentEntry.length() - 1));
|
dataWriter.writeVInt(currentEntry.charAt(currentEntry.length() - 1));
|
||||||
dataWriter.writeVInt(pos - lastPos);
|
dataWriter.writeVInt(pos - lastPos);
|
||||||
IntSequenceOutputs.getSingleton().write(currentOrds.get(), dataWriter);
|
if (prevCode != 0) {
|
||||||
if (collision != 0) {
|
dataWriter.writeByte((byte) (prevCode >>> OFFSET_BITS));
|
||||||
dataWriter.writeByte(collision < 0 ? (byte) 1 : 0);
|
dataWriter.writeVInt(pos - (prevCode & OFFSET_MASK));
|
||||||
dataWriter.writeVInt(pos - Math.abs(collision));
|
|
||||||
}
|
}
|
||||||
|
IntSequenceOutputs.getSingleton().write(currentOrds.get(), dataWriter);
|
||||||
|
|
||||||
group.clear();
|
group.clear();
|
||||||
morphDataIDs.clear();
|
morphDataIDs.clear();
|
||||||
|
|
|
@ -77,20 +77,26 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
reader.lines().skip(1).map(s -> s.split("/")[0]).collect(Collectors.toSet());
|
reader.lines().skip(1).map(s -> s.split("/")[0]).collect(Collectors.toSet());
|
||||||
int maxLength = allWords.stream().mapToInt(String::length).max().orElseThrow();
|
int maxLength = allWords.stream().mapToInt(String::length).max().orElseThrow();
|
||||||
|
|
||||||
for (int i = 1; i <= maxLength + 1; i++) {
|
for (int min = 1; min <= maxLength + 1; min++) {
|
||||||
checkProcessWords(dictionary, allWords, i);
|
for (int max = min; max <= maxLength + 1; max++) {
|
||||||
|
checkProcessWords(dictionary, allWords, min, max);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkProcessWords(Dictionary dictionary, Set<String> allWords, int maxLength) {
|
private void checkProcessWords(
|
||||||
|
Dictionary dictionary, Set<String> allWords, int minLength, int maxLength) {
|
||||||
Set<String> processed = new HashSet<>();
|
Set<String> processed = new HashSet<>();
|
||||||
dictionary.words.processAllWords(maxLength, (word, __) -> processed.add(word.toString()));
|
dictionary.words.processAllWords(
|
||||||
|
minLength, maxLength, (word, __) -> processed.add(word.toString()));
|
||||||
|
|
||||||
Set<String> filtered =
|
Set<String> filtered =
|
||||||
allWords.stream().filter(s -> s.length() <= maxLength).collect(Collectors.toSet());
|
allWords.stream()
|
||||||
|
.filter(s -> minLength <= s.length() && s.length() <= maxLength)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
assertEquals("For length " + maxLength, filtered, processed);
|
assertEquals("For lengths [" + minLength + "," + maxLength + "]", filtered, processed);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testCompressedDictionary() throws Exception {
|
public void testCompressedDictionary() throws Exception {
|
||||||
|
|
Loading…
Reference in New Issue