mirror of https://github.com/apache/lucene.git
LUCENE-9716: Hunspell: support flag usage before its format is even specified (#2277)
This commit is contained in:
parent
47e3d06ce0
commit
8f75933f3d
|
@ -17,7 +17,6 @@
|
||||||
package org.apache.lucene.analysis.hunspell;
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
import java.io.BufferedOutputStream;
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
@ -45,8 +44,6 @@ import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IOContext;
|
import org.apache.lucene.store.IOContext;
|
||||||
|
@ -84,6 +81,7 @@ public class Dictionary {
|
||||||
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
|
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
|
||||||
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
||||||
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
|
static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
|
||||||
|
CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
|
||||||
|
|
||||||
FST<IntsRef> prefixes;
|
FST<IntsRef> prefixes;
|
||||||
FST<IntsRef> suffixes;
|
FST<IntsRef> suffixes;
|
||||||
|
@ -212,25 +210,21 @@ public class Dictionary {
|
||||||
|
|
||||||
Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
|
Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
|
||||||
Path aff = Files.createTempFile(tempPath, "affix", "aff");
|
Path aff = Files.createTempFile(tempPath, "affix", "aff");
|
||||||
OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
|
|
||||||
InputStream aff1 = null;
|
BufferedInputStream aff1 = null;
|
||||||
InputStream aff2 = null;
|
InputStream aff2 = null;
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
try {
|
try {
|
||||||
// copy contents of affix stream to temp file
|
// Copy contents of the affix stream to a temp file.
|
||||||
final byte[] buffer = new byte[1024 * 8];
|
try (OutputStream os = Files.newOutputStream(aff)) {
|
||||||
int len;
|
affix.transferTo(os);
|
||||||
while ((len = affix.read(buffer)) > 0) {
|
|
||||||
out.write(buffer, 0, len);
|
|
||||||
}
|
}
|
||||||
out.close();
|
|
||||||
|
|
||||||
// pass 1: get encoding
|
// pass 1: get encoding & flag
|
||||||
aff1 = new BufferedInputStream(Files.newInputStream(aff));
|
aff1 = new BufferedInputStream(Files.newInputStream(aff));
|
||||||
String encoding = getDictionaryEncoding(aff1);
|
readConfig(aff1);
|
||||||
|
|
||||||
// pass 2: parse affixes
|
// pass 2: parse affixes
|
||||||
CharsetDecoder decoder = getJavaEncoding(encoding);
|
|
||||||
aff2 = new BufferedInputStream(Files.newInputStream(aff));
|
aff2 = new BufferedInputStream(Files.newInputStream(aff));
|
||||||
readAffixFile(aff2, decoder);
|
readAffixFile(aff2, decoder);
|
||||||
|
|
||||||
|
@ -242,7 +236,7 @@ public class Dictionary {
|
||||||
morphAliases = null; // no longer needed
|
morphAliases = null; // no longer needed
|
||||||
success = true;
|
success = true;
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.closeWhileHandlingException(out, aff1, aff2);
|
IOUtils.closeWhileHandlingException(aff1, aff2);
|
||||||
if (success) {
|
if (success) {
|
||||||
Files.delete(aff);
|
Files.delete(aff);
|
||||||
} else {
|
} else {
|
||||||
|
@ -344,10 +338,6 @@ public class Dictionary {
|
||||||
} else if ("SFX".equals(firstWord)) {
|
} else if ("SFX".equals(firstWord)) {
|
||||||
parseAffix(
|
parseAffix(
|
||||||
suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
|
suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
|
||||||
} else if ("FLAG".equals(firstWord)) {
|
|
||||||
// Assume that the FLAG line comes before any prefix or suffixes
|
|
||||||
// Store the strategy so it can be used when parsing the dic file
|
|
||||||
flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset());
|
|
||||||
} else if (line.equals("COMPLEXPREFIXES")) {
|
} else if (line.equals("COMPLEXPREFIXES")) {
|
||||||
complexPrefixes =
|
complexPrefixes =
|
||||||
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
|
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
|
||||||
|
@ -696,46 +686,51 @@ public class Dictionary {
|
||||||
return fstCompiler.compile();
|
return fstCompiler.compile();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** pattern accepts optional BOM + SET + any whitespace */
|
private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
|
||||||
static final Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+");
|
|
||||||
|
/** Parses the encoding and flag format specified in the provided InputStream */
|
||||||
|
private void readConfig(BufferedInputStream stream) throws IOException, ParseException {
|
||||||
|
// Only the UTF-8 BOM is recognized here. Other BOMs (UTF-16, etc.) could trivially be
|
||||||
|
// supported by adding a maybeConsume() call with the matching byte sequence, but the
|
||||||
|
// hunspell repository does not appear to contain any such exotic examples.
|
||||||
|
Charset streamCharset;
|
||||||
|
if (maybeConsume(stream, BOM_UTF8)) {
|
||||||
|
streamCharset = StandardCharsets.UTF_8;
|
||||||
|
} else {
|
||||||
|
streamCharset = DEFAULT_CHARSET;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: can these flags change throughout the file? If not then we can abort sooner. And
|
||||||
|
// then we wouldn't even need to create a temp file for the affix stream - a large enough
|
||||||
|
// leading buffer (BufferedInputStream) would be sufficient?
|
||||||
|
LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, streamCharset));
|
||||||
|
String line;
|
||||||
|
while ((line = reader.readLine()) != null) {
|
||||||
|
String firstWord = line.split("\\s")[0];
|
||||||
|
if ("SET".equals(firstWord)) {
|
||||||
|
decoder = getDecoder(singleArgument(reader, line));
|
||||||
|
} else if ("FLAG".equals(firstWord)) {
|
||||||
|
flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parses the encoding specified in the affix file readable through the provided InputStream
|
* Consume the provided byte sequence in full, if present. Otherwise leave the input stream
|
||||||
|
* intact.
|
||||||
*
|
*
|
||||||
* @param affix InputStream for reading the affix file
|
* @return {@code true} if the sequence matched and has been consumed.
|
||||||
* @return Encoding specified in the affix file
|
|
||||||
* @throws IOException Can be thrown while reading from the InputStream
|
|
||||||
*/
|
*/
|
||||||
static String getDictionaryEncoding(InputStream affix) throws IOException {
|
private static boolean maybeConsume(BufferedInputStream stream, byte[] bytes) throws IOException {
|
||||||
final StringBuilder encoding = new StringBuilder();
|
stream.mark(bytes.length);
|
||||||
for (; ; ) {
|
for (int i = 0; i < bytes.length; i++) {
|
||||||
encoding.setLength(0);
|
int nextByte = stream.read();
|
||||||
int ch;
|
if (nextByte != (bytes[i] & 0xff)) { // covers EOF (-1) as well.
|
||||||
while ((ch = affix.read()) >= 0) {
|
stream.reset();
|
||||||
if (ch == '\n') {
|
return false;
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (ch != '\r') {
|
|
||||||
encoding.append((char) ch);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (encoding.length() == 0
|
|
||||||
|| encoding.charAt(0) == '#'
|
|
||||||
||
|
|
||||||
// this test comes last because it is inefficient, but it allows lines containing only spaces:
|
|
||||||
encoding.toString().trim().length() == 0) {
|
|
||||||
if (ch < 0) {
|
|
||||||
return DEFAULT_CHARSET.name();
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Matcher matcher = ENCODING_PATTERN.matcher(encoding);
|
|
||||||
if (matcher.find()) {
|
|
||||||
int last = matcher.end();
|
|
||||||
return encoding.substring(last).trim();
|
|
||||||
}
|
|
||||||
return DEFAULT_CHARSET.name();
|
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static final Map<String, String> CHARSET_ALIASES =
|
static final Map<String, String> CHARSET_ALIASES =
|
||||||
|
@ -748,7 +743,7 @@ public class Dictionary {
|
||||||
* @param encoding Encoding to retrieve the CharsetDecoder for
|
* @param encoding Encoding to retrieve the CharsetDecoder for
|
||||||
* @return CharsetDecoder for the given encoding
|
* @return CharsetDecoder for the given encoding
|
||||||
*/
|
*/
|
||||||
private CharsetDecoder getJavaEncoding(String encoding) {
|
private CharsetDecoder getDecoder(String encoding) {
|
||||||
if ("ISO8859-14".equals(encoding)) {
|
if ("ISO8859-14".equals(encoding)) {
|
||||||
return new ISO8859_14Decoder();
|
return new ISO8859_14Decoder();
|
||||||
}
|
}
|
||||||
|
@ -756,7 +751,10 @@ public class Dictionary {
|
||||||
if (canon != null) {
|
if (canon != null) {
|
||||||
encoding = canon;
|
encoding = canon;
|
||||||
}
|
}
|
||||||
Charset charset = Charset.forName(encoding);
|
return replacingDecoder(Charset.forName(encoding));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static CharsetDecoder replacingDecoder(Charset charset) {
|
||||||
return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
|
return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,7 @@ import java.nio.charset.Charset;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
@ -142,6 +143,20 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
tempDir.close();
|
tempDir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testUsingFlagsBeforeFlagDirective() throws IOException, ParseException {
|
||||||
|
byte[] aff = "KEEPCASE 42\nFLAG num".getBytes(StandardCharsets.UTF_8);
|
||||||
|
byte[] dic = "1\nfoo/42".getBytes(StandardCharsets.UTF_8);
|
||||||
|
|
||||||
|
Dictionary dictionary =
|
||||||
|
new Dictionary(
|
||||||
|
new ByteBuffersDirectory(),
|
||||||
|
"",
|
||||||
|
new ByteArrayInputStream(aff),
|
||||||
|
new ByteArrayInputStream(dic));
|
||||||
|
|
||||||
|
assertEquals(42, dictionary.keepcase);
|
||||||
|
}
|
||||||
|
|
||||||
// malformed flags cause a ParseException
|
// malformed flags cause a ParseException
|
||||||
public void testInvalidFlags() throws Exception {
|
public void testInvalidFlags() throws Exception {
|
||||||
InputStream affixStream = getClass().getResourceAsStream("broken-flags.aff");
|
InputStream affixStream = getClass().getResourceAsStream("broken-flags.aff");
|
||||||
|
@ -245,25 +260,21 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSetWithCrazyWhitespaceAndBOMs() throws Exception {
|
public void testSetWithCrazyWhitespaceAndBOMs() throws Exception {
|
||||||
assertEquals(
|
assertEquals("UTF-8", getDictionaryEncoding("SET\tUTF-8\n"));
|
||||||
"UTF-8",
|
assertEquals("UTF-8", getDictionaryEncoding("SET\t UTF-8\n"));
|
||||||
Dictionary.getDictionaryEncoding(
|
assertEquals("UTF-8", getDictionaryEncoding("\uFEFFSET\tUTF-8\n"));
|
||||||
new ByteArrayInputStream("SET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
|
assertEquals("UTF-8", getDictionaryEncoding("\uFEFFSET\tUTF-8\r\n"));
|
||||||
assertEquals(
|
assertEquals(Dictionary.DEFAULT_CHARSET.name(), getDictionaryEncoding(""));
|
||||||
"UTF-8",
|
}
|
||||||
Dictionary.getDictionaryEncoding(
|
|
||||||
new ByteArrayInputStream("SET\t UTF-8\n".getBytes(StandardCharsets.UTF_8))));
|
private static String getDictionaryEncoding(String affFile) throws IOException, ParseException {
|
||||||
assertEquals(
|
Dictionary dictionary =
|
||||||
"UTF-8",
|
new Dictionary(
|
||||||
Dictionary.getDictionaryEncoding(
|
new ByteBuffersDirectory(),
|
||||||
new ByteArrayInputStream("\uFEFFSET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
|
"",
|
||||||
assertEquals(
|
new ByteArrayInputStream(affFile.getBytes(StandardCharsets.UTF_8)),
|
||||||
"UTF-8",
|
new ByteArrayInputStream("1\nmock".getBytes(StandardCharsets.UTF_8)));
|
||||||
Dictionary.getDictionaryEncoding(
|
return dictionary.decoder.charset().name();
|
||||||
new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8))));
|
|
||||||
assertEquals(
|
|
||||||
Dictionary.DEFAULT_CHARSET.name(),
|
|
||||||
Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0])));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testFlagWithCrazyWhitespace() {
|
public void testFlagWithCrazyWhitespace() {
|
||||||
|
|
Loading…
Reference in New Issue