Ensure Nori/Kuromoji shipped binary FST is the latest version (#12933)

* ensure Nori/Kuromoji shipped binary FST is the latest version (closes #12911)

* fold feedback from @uschindler: sharpen test failure messages to give the specific gradlew command to regenerate the precise FST (not everything)

* add javadoc for FSTMetadata.getVersion
This commit is contained in:
Michael McCandless 2023-12-14 07:38:34 -05:00 committed by GitHub
parent 3965319441
commit ebf9e29570
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 62 additions and 3 deletions

View File

@ -111,7 +111,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
this.fst = new TokenInfoFST(fst, true);
}
private static InputStream getClassResource(String suffix) throws IOException {
static InputStream getClassResource(String suffix) throws IOException {
final String resourcePath = TokenInfoDictionary.class.getSimpleName() + suffix;
return IOUtils.requireResourceNonNull(
TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath);

View File

@ -21,18 +21,22 @@ import static org.apache.lucene.analysis.morph.BinaryDictionary.DICT_FILENAME_SU
import static org.apache.lucene.analysis.morph.BinaryDictionary.POSDICT_FILENAME_SUFFIX;
import static org.apache.lucene.analysis.morph.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
import org.apache.lucene.util.fst.PositiveIntOutputs;
/** Tests of TokenInfoDictionary build tools; run using ant test-tools */
public class TestTokenInfoDictionary extends LuceneTestCase {
@ -178,4 +182,25 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
}
}
// #12911: verifies that the binary FST shipped for TokenInfoDictionary was written in the
// FST.VERSION_CURRENT format
public void testBinaryFSTIsLatestFormat() throws Exception {
  try (InputStream fstStream =
      new BufferedInputStream(
          TokenInfoDictionary.getClassResource(TokenInfoDictionary.FST_FILENAME_SUFFIX))) {
    // only the FSTMetadata is read; that is all the version check needs
    InputStreamDataInput metaIn = new InputStreamDataInput(fstStream);
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    int shippedVersion = FST.readMetadata(metaIn, outputs).getVersion();

    // the failure message names the exact gradle task that regenerates this FST
    String failureHint =
        "TokenInfoDictionary's FST is not the latest version: expected "
            + FST.VERSION_CURRENT
            + " but got: "
            + shippedVersion
            + "; run \"./gradlew :lucene:analysis:kuromoji:regenerate\" to regenerate this FST";
    assertEquals(failureHint, FST.VERSION_CURRENT, shippedVersion);
  }
}
}

View File

@ -109,7 +109,7 @@ public final class TokenInfoDictionary extends BinaryDictionary<TokenInfoMorphDa
this.fst = new TokenInfoFST(fst);
}
private static InputStream getClassResource(String suffix) throws IOException {
static InputStream getClassResource(String suffix) throws IOException {
final String resourcePath = TokenInfoDictionary.class.getSimpleName() + suffix;
return IOUtils.requireResourceNonNull(
TokenInfoDictionary.class.getResourceAsStream(resourcePath), resourcePath);

View File

@ -21,6 +21,8 @@ import static org.apache.lucene.analysis.morph.BinaryDictionary.DICT_FILENAME_SU
import static org.apache.lucene.analysis.morph.BinaryDictionary.POSDICT_FILENAME_SUFFIX;
import static org.apache.lucene.analysis.morph.BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
@ -28,12 +30,14 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
import org.apache.lucene.util.fst.PositiveIntOutputs;
/** Tests of TokenInfoDictionary build tools; run using ant test-tools */
public class TestTokenInfoDictionary extends LuceneTestCase {
@ -185,4 +189,25 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
}
}
// #12911: verifies that the binary FST shipped for TokenInfoDictionary was written in the
// FST.VERSION_CURRENT format
public void testBinaryFSTIsLatestFormat() throws Exception {
  try (InputStream fstStream =
      new BufferedInputStream(
          TokenInfoDictionary.getClassResource(TokenInfoDictionary.FST_FILENAME_SUFFIX))) {
    // only the FSTMetadata is read; that is all the version check needs
    InputStreamDataInput metaIn = new InputStreamDataInput(fstStream);
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    int shippedVersion = FST.readMetadata(metaIn, outputs).getVersion();

    // the failure message names the exact gradle task that regenerates this FST
    String failureHint =
        "TokenInfoDictionary's FST is not the latest version: expected "
            + FST.VERSION_CURRENT
            + " but got: "
            + shippedVersion
            + "; run \"./gradlew :lucene:analysis:nori:regenerate\" to regenerate this FST";
    assertEquals(failureHint, FST.VERSION_CURRENT, shippedVersion);
  }
}
}

View File

@ -1208,7 +1208,7 @@ public final class FST<T> implements Accountable {
}
/**
* Represent the FST metadata
* Represents the FST metadata.
*
* @param <T> the FST output type
*/
@ -1236,5 +1236,14 @@ public final class FST<T> implements Accountable {
this.version = version;
this.numBytes = numBytes;
}
/**
 * Returns the version constant identifying the binary format in which this FST was written.
 * The possible values are the {@code static final int VERSION} constants described in FST's
 * javadoc, for example {@link FST#VERSION_CONTINUOUS_ARCS}.
 *
 * @return the format version stored in this metadata
 */
public int getVersion() {
  return this.version;
}
}
}