mirror of https://github.com/apache/lucene.git
LUCENE-8971: Enable constructing JapaneseTokenizer with custom dictionary
This commit is contained in:
parent
c514b29b24
commit
770464ec20
|
@ -202,7 +202,7 @@ public final class JapaneseTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Create a new JapaneseTokenizer.
|
||||
* Create a new JapaneseTokenizer using the system and unknown dictionaries shipped with Lucene.
|
||||
*
|
||||
* @param factory the AttributeFactory to use
|
||||
* @param userDictionary Optional: if non-null, user dictionary.
|
||||
|
@ -211,13 +211,40 @@ public final class JapaneseTokenizer extends Tokenizer {
|
|||
*/
|
||||
public JapaneseTokenizer
|
||||
(AttributeFactory factory, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
|
||||
this(factory,
|
||||
TokenInfoDictionary.getInstance(),
|
||||
UnknownDictionary.getInstance(),
|
||||
ConnectionCosts.getInstance(),
|
||||
userDictionary, discardPunctuation, mode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new JapaneseTokenizer, supplying a custom system dictionary, unknown dictionary and connection costs.
|
||||
* <p>
|
||||
* Uses the supplied AttributeFactory.
|
||||
*
|
||||
* @param factory the AttributeFactory to use
|
||||
* @param systemDictionary a custom known token dictionary
|
||||
* @param unkDictionary a custom unknown token dictionary
|
||||
* @param connectionCosts custom token transition costs
|
||||
* @param userDictionary Optional: if non-null, user dictionary.
|
||||
* @param discardPunctuation true if punctuation tokens should be dropped from the output.
|
||||
* @param mode tokenization mode.
|
||||
*/
|
||||
public JapaneseTokenizer(AttributeFactory factory,
|
||||
TokenInfoDictionary systemDictionary,
|
||||
UnknownDictionary unkDictionary,
|
||||
ConnectionCosts connectionCosts,
|
||||
UserDictionary userDictionary,
|
||||
boolean discardPunctuation,
|
||||
Mode mode) {
|
||||
super(factory);
|
||||
dictionary = TokenInfoDictionary.getInstance();
|
||||
fst = dictionary.getFST();
|
||||
unkDictionary = UnknownDictionary.getInstance();
|
||||
characterDefinition = unkDictionary.getCharacterDefinition();
|
||||
this.dictionary = systemDictionary;
|
||||
this.fst = dictionary.getFST();
|
||||
this.unkDictionary = unkDictionary;
|
||||
this.characterDefinition = unkDictionary.getCharacterDefinition();
|
||||
this.userDictionary = userDictionary;
|
||||
costs = ConnectionCosts.getInstance();
|
||||
this.costs = connectionCosts;
|
||||
fstReader = fst.getBytesReader();
|
||||
if (userDictionary != null) {
|
||||
userFST = userDictionary.getFST();
|
||||
|
|
|
@ -176,6 +176,17 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
}
|
||||
}
|
||||
|
||||
public static final InputStream getResource(ResourceScheme scheme, String path) throws IOException {
|
||||
switch(scheme) {
|
||||
case CLASSPATH:
|
||||
return getClassResource(path);
|
||||
case FILE:
|
||||
return Files.newInputStream(Paths.get(path));
|
||||
default:
|
||||
throw new IllegalStateException("unknown resource scheme " + scheme);
|
||||
}
|
||||
}
|
||||
|
||||
// util, reused by ConnectionCosts and CharacterDefinition
|
||||
public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
|
||||
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
|
||||
|
@ -185,7 +196,7 @@ public abstract class BinaryDictionary implements Dictionary {
|
|||
return is;
|
||||
}
|
||||
|
||||
private InputStream getClassResource(String path) throws IOException {
|
||||
private static InputStream getClassResource(String path) throws IOException {
|
||||
final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path);
|
||||
if (is == null) {
|
||||
throw new FileNotFoundException("Not in classpath: " + path);
|
||||
|
|
|
@ -37,12 +37,16 @@ public final class ConnectionCosts {
|
|||
|
||||
private final short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
|
||||
|
||||
private ConnectionCosts() throws IOException {
|
||||
/**
|
||||
* @param scheme - scheme for loading resources (FILE or CLASSPATH).
|
||||
* @param path - where to load resources from, without the ".dat" suffix; note that every '.' in the path is replaced with '/' before the suffix is appended
|
||||
*/
|
||||
public ConnectionCosts(BinaryDictionary.ResourceScheme scheme, String path) throws IOException {
|
||||
InputStream is = null;
|
||||
short[][] costs = null;
|
||||
boolean success = false;
|
||||
try {
|
||||
is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
|
||||
is = BinaryDictionary.getResource(scheme, path.replace('.', '/') + FILENAME_SUFFIX);
|
||||
is = new BufferedInputStream(is);
|
||||
final DataInput in = new InputStreamDataInput(is);
|
||||
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
|
||||
|
@ -68,7 +72,11 @@ public final class ConnectionCosts {
|
|||
|
||||
this.costs = costs;
|
||||
}
|
||||
|
||||
|
||||
private ConnectionCosts() throws IOException {
|
||||
this(BinaryDictionary.ResourceScheme.CLASSPATH, ConnectionCosts.class.getName());
|
||||
}
|
||||
|
||||
public int get(int forwardId, int backwardId) {
|
||||
return costs[backwardId][forwardId];
|
||||
}
|
||||
|
|
|
@ -40,7 +40,7 @@ public final class TokenInfoDictionary extends BinaryDictionary {
|
|||
* @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
|
||||
* this class's name as the path.
|
||||
*/
|
||||
TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
|
||||
public TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
|
||||
super(resourceScheme, resourcePath);
|
||||
FST<Long> fst;
|
||||
try (InputStream is = new BufferedInputStream(getResource(FST_FILENAME_SUFFIX))) {
|
||||
|
|
|
@ -26,6 +26,15 @@ public final class UnknownDictionary extends BinaryDictionary {
|
|||
|
||||
private final CharacterDefinition characterDefinition = CharacterDefinition.getInstance();
|
||||
|
||||
/**
|
||||
* @param scheme scheme for loading resources (FILE or CLASSPATH).
|
||||
* @param path where to load resources from; a path, including the file base name without
|
||||
* extension; this is used to match multiple files with the same base name.
|
||||
*/
|
||||
public UnknownDictionary(ResourceScheme scheme, String path) throws IOException {
|
||||
super(scheme, path);
|
||||
}
|
||||
|
||||
private UnknownDictionary() throws IOException {
|
||||
super();
|
||||
}
|
||||
|
|
|
@ -33,7 +33,10 @@ import org.apache.lucene.analysis.MockGraphTokenFilter;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
|
||||
import org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;
|
||||
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
|
||||
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
|
||||
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
|
||||
import org.apache.lucene.analysis.ja.dict.UserDictionary;
|
||||
import org.apache.lucene.analysis.ja.tokenattributes.*;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
@ -441,6 +444,23 @@ public class
|
|||
);
|
||||
}
|
||||
|
||||
// Make sure loading custom dictionaries from classpath works:
|
||||
public void testCustomDictionary() throws Exception {
|
||||
Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(),
|
||||
new TokenInfoDictionary(ResourceScheme.CLASSPATH, "org/apache/lucene/analysis/ja/dict/TokenInfoDictionary"),
|
||||
new UnknownDictionary(ResourceScheme.CLASSPATH, "org/apache/lucene/analysis/ja/dict/UnknownDictionary"),
|
||||
new ConnectionCosts(ResourceScheme.CLASSPATH, "org/apache/lucene/analysis/ja/dict/ConnectionCosts"),
|
||||
readDict(), true, Mode.SEARCH);
|
||||
try (Analyzer a = makeAnalyzer(tokenizer)) {
|
||||
assertTokenStreamContents(a.tokenStream("foo", "abcd"),
|
||||
new String[] { "a", "b", "cd" },
|
||||
new int[] { 0, 1, 2 },
|
||||
new int[] { 1, 2, 4 },
|
||||
4
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// HMM: fails (segments as a/b/cd/efghij)... because the
|
||||
// two paths have exactly equal paths (1 KNOWN + 1
|
||||
// UNKNOWN) and we don't seem to favor longer KNOWN /
|
||||
|
|
Loading…
Reference in New Issue