LUCENE-8971: Enable constructing JapaneseTokenizer with custom dictionary

This commit is contained in:
Michael Sokolov 2019-09-06 08:34:32 -04:00 committed by Michael Sokolov
parent c514b29b24
commit 770464ec20
6 changed files with 86 additions and 11 deletions

View File

@ -202,7 +202,7 @@ public final class JapaneseTokenizer extends Tokenizer {
}
/**
* Create a new JapaneseTokenizer.
* Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param userDictionary Optional: if non-null, user dictionary.
@ -211,13 +211,40 @@ public final class JapaneseTokenizer extends Tokenizer {
*/
public JapaneseTokenizer
(AttributeFactory factory, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
// Delegate to the full constructor, supplying the singleton system dictionary,
// unknown-word dictionary, and connection costs bundled with Lucene.
this(factory,
TokenInfoDictionary.getInstance(),
UnknownDictionary.getInstance(),
ConnectionCosts.getInstance(),
userDictionary, discardPunctuation, mode);
}
/**
* Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
* <p>
* The supplied dictionaries replace the built-in ones shipped with Lucene.
*
* @param factory the AttributeFactory to use
* @param systemDictionary a custom known token dictionary
* @param unkDictionary a custom unknown token dictionary
* @param connectionCosts custom token transition costs
* @param userDictionary Optional: if non-null, user dictionary.
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
* @param mode tokenization mode.
*/
public JapaneseTokenizer(AttributeFactory factory,
TokenInfoDictionary systemDictionary,
UnknownDictionary unkDictionary,
ConnectionCosts connectionCosts,
UserDictionary userDictionary,
boolean discardPunctuation,
Mode mode) {
super(factory);
dictionary = TokenInfoDictionary.getInstance();
fst = dictionary.getFST();
unkDictionary = UnknownDictionary.getInstance();
characterDefinition = unkDictionary.getCharacterDefinition();
this.dictionary = systemDictionary;
this.fst = dictionary.getFST();
this.unkDictionary = unkDictionary;
this.characterDefinition = unkDictionary.getCharacterDefinition();
this.userDictionary = userDictionary;
costs = ConnectionCosts.getInstance();
this.costs = connectionCosts;
fstReader = fst.getBytesReader();
if (userDictionary != null) {
userFST = userDictionary.getFST();

View File

@ -176,6 +176,17 @@ public abstract class BinaryDictionary implements Dictionary {
}
}
/**
 * Opens an InputStream for the given resource path.
 *
 * @param scheme how to resolve the path: CLASSPATH loads via this class's
 *               class loader, FILE reads from the local filesystem
 * @param path the resource location to open
 * @return an open stream; the caller is responsible for closing it
 * @throws IOException if the resource cannot be opened
 */
public static final InputStream getResource(ResourceScheme scheme, String path) throws IOException {
  final InputStream stream;
  switch (scheme) {
    case CLASSPATH:
      stream = getClassResource(path);
      break;
    case FILE:
      stream = Files.newInputStream(Paths.get(path));
      break;
    default:
      // Defensive: only reachable if a new scheme is added without updating this switch.
      throw new IllegalStateException("unknown resource scheme " + scheme);
  }
  return stream;
}
// util, reused by ConnectionCosts and CharacterDefinition
public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
@ -185,7 +196,7 @@ public abstract class BinaryDictionary implements Dictionary {
return is;
}
private InputStream getClassResource(String path) throws IOException {
private static InputStream getClassResource(String path) throws IOException {
final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path);
if (is == null) {
throw new FileNotFoundException("Not in classpath: " + path);

View File

@ -37,12 +37,16 @@ public final class ConnectionCosts {
private final short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
private ConnectionCosts() throws IOException {
/**
* @param scheme - scheme for loading resources (FILE or CLASSPATH).
* @param path - where to load resources from, without the ".dat" suffix
*/
public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String path) throws IOException {
InputStream is = null;
short[][] costs = null;
boolean success = false;
try {
is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
is = BinaryDictionary.getResource(scheme, path.replace('.', '/') + FILENAME_SUFFIX);
is = new BufferedInputStream(is);
final DataInput in = new InputStreamDataInput(is);
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
@ -68,7 +72,11 @@ public final class ConnectionCosts {
this.costs = costs;
}
// Loads the default connection costs bundled on the classpath next to this class
// (the class name doubles as the resource path, minus the ".dat" suffix).
private ConnectionCosts() throws IOException {
this(BinaryDictionary.ResourceScheme.CLASSPATH, ConnectionCosts.class.getName());
}
/**
 * Returns the transition cost between two part-of-speech IDs.
 *
 * @param forwardId forward (right) connection ID
 * @param backwardId backward (left) connection ID
 * @return the connection cost
 */
public int get(int forwardId, int backwardId) {
  // The table is indexed backward-ID first (see the field comment on costs).
  final short[] row = costs[backwardId];
  return row[forwardId];
}

View File

@ -40,7 +40,7 @@ public final class TokenInfoDictionary extends BinaryDictionary {
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
* this class's name as the path.
*/
TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
super(resourceScheme, resourcePath);
FST<Long> fst;
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {

View File

@ -26,6 +26,15 @@ public final class UnknownDictionary extends BinaryDictionary {
private final CharacterDefinition characterDefinition = CharacterDefinition.getInstance();
/**
* @param scheme scheme for loading resources (FILE or CLASSPATH).
* @param path where to load resources from; a path, including the file base name without
* extension; this is used to match multiple files with the same base name.
*/
public UnknownDictionary(ResourceScheme scheme, String path) throws IOException {
// All resource resolution and loading is handled by BinaryDictionary.
super(scheme, path);
}
// Default constructor used by the singleton; presumably loads the bundled
// classpath resources via BinaryDictionary's no-arg constructor — TODO confirm.
private UnknownDictionary() throws IOException {
super();
}

View File

@ -33,7 +33,10 @@ import org.apache.lucene.analysis.MockGraphTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.ja.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -441,6 +444,23 @@ public class
);
}
// Make sure loading custom dictionaries from classpath works:
public void testCustomDictionary() throws Exception {
  // Point every dictionary at the same classpath location the defaults live in.
  final String dictPath = "org/apache/lucene/analysis/ja/dict/";
  TokenInfoDictionary systemDict =
      new TokenInfoDictionary(ResourceScheme.CLASSPATH, dictPath + "TokenInfoDictionary");
  UnknownDictionary unkDict =
      new UnknownDictionary(ResourceScheme.CLASSPATH, dictPath + "UnknownDictionary");
  ConnectionCosts connectionCosts =
      new ConnectionCosts(ResourceScheme.CLASSPATH, dictPath + "ConnectionCosts");
  Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(),
      systemDict, unkDict, connectionCosts,
      readDict(), true, Mode.SEARCH);
  try (Analyzer a = makeAnalyzer(tokenizer)) {
    assertTokenStreamContents(a.tokenStream("foo", "abcd"),
        new String[] { "a", "b", "cd" },
        new int[] { 0, 1, 2 },
        new int[] { 1, 2, 4 },
        4
    );
  }
}
// HMM: fails (segments as a/b/cd/efghij)... because the
// two paths have exactly equal paths (1 KNOWN + 1
// UNKNOWN) and we don't seem to favor longer KNOWN /