LUCENE-3305: add Kuromoji Japanese morphological analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1230748 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-12 20:10:48 +00:00
parent ec071905fb
commit cd372bdc83
81 changed files with 7482 additions and 49 deletions

View File

@@ -24,6 +24,9 @@
<classpathentry kind="src" path="modules/analysis/icu/src/java"/>
<classpathentry kind="src" path="modules/analysis/icu/src/resources"/>
<classpathentry kind="src" path="modules/analysis/icu/src/test"/>
<classpathentry kind="src" path="modules/analysis/kuromoji/src/java"/>
<classpathentry kind="src" path="modules/analysis/kuromoji/src/resources"/>
<classpathentry kind="src" path="modules/analysis/kuromoji/src/test"/>
<classpathentry kind="src" path="modules/analysis/phonetic/src/java"/>
<classpathentry kind="src" path="modules/analysis/phonetic/src/test"/>
<classpathentry kind="src" path="modules/analysis/smartcn/src/java"/>

View File

@@ -196,6 +196,17 @@
<property name="analyzers-stempel.uptodate" value="true"/>
</target>
<property name="analyzers-kuromoji.jar" value="${common.dir}/../modules/analysis/build/kuromoji/lucene-analyzers-kuromoji-${version}.jar"/>
<target name="check-analyzers-kuromoji-uptodate" unless="analyzers-kuromoji.uptodate">
<module-uptodate name="analysis/kuromoji" jarfile="${analyzers-kuromoji.jar}" property="analyzers-kuromoji.uptodate"/>
</target>
<target name="jar-analyzers-kuromoji" unless="analyzers-kuromoji.uptodate" depends="check-analyzers-kuromoji-uptodate">
<ant dir="${common.dir}/../modules/analysis/kuromoji" target="jar-core" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
<property name="analyzers-kuromoji.uptodate" value="true"/>
</target>
<property name="grouping.jar" value="${common.dir}/../modules/grouping/build/lucene-grouping-${version}.jar"/>
<target name="check-grouping-uptodate" unless="grouping.uptodate">
<module-uptodate name="grouping" jarfile="${grouping.jar}" property="grouping.uptodate"/>

View File

@@ -42,6 +42,11 @@ API Changes
since they prevent reuse. Both Analyzers should be configured at instantiation.
(Chris Male)
* LUCENE-3305: Added SegmentingTokenizerBase, which breaks text into sentences
with BreakIterator and allows subclasses to decompose sentences into words, or
use the sentence boundary information for other reasons (e.g. attribute/position increment)
(Robert Muir)
New Features
* LUCENE-2341: A new analyzer/filter: Morfologik - a dictionary-driven lemmatizer
@@ -109,6 +114,9 @@ New Features
* LUCENE-3414: Added HunspellStemFilter which uses a provided pure Java implementation of the
Hunspell algorithm. (Chris Male)
* LUCENE-3305: Added Kuromoji morphological analyzer for Japanese.
(Christian Moen, Masaru Hasegawa, Simon Willnauer, Uwe Schindler, Robert Muir)
Build
* LUCENE-2413: All analyzers in contrib/analyzers and contrib/icu were moved to the

View File

@@ -71,3 +71,86 @@ LGPL and Creative Commons ShareAlike.
Morfologic includes data from BSD-licensed dictionary of Polish (SGJP)
(http://sgjp.pl/morfeusz/)
===========================================================================
Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration
===========================================================================
This software includes a binary and/or source version of data from
mecab-ipadic-2.7.0-20070801
which can be obtained from
http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz
or
http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
===========================================================================
mecab-ipadic-2.7.0-20070801 Notice
===========================================================================
Nara Institute of Science and Technology (NAIST),
the copyright holders, disclaims all warranties with regard to this
software, including all implied warranties of merchantability and
fitness, in no event shall NAIST be liable for
any special, indirect or consequential damages or any damages
whatsoever resulting from loss of use, data or profits, whether in an
action of contract, negligence or other tortuous action, arising out
of or in connection with the use or performance of this software.
A large portion of the dictionary entries
originate from ICOT Free Software. The following conditions for ICOT
Free Software applies to the current dictionary as well.
Each User may also freely distribute the Program, whether in its
original form or modified, to any third party or parties, PROVIDED
that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
on, or be attached to, the Program, which is distributed substantially
in the same form as set out herein and that such intended
distribution, if actually made, will neither violate or otherwise
contravene any of the laws and regulations of the countries having
jurisdiction over the User or the intended distribution itself.
NO WARRANTY
The program was produced on an experimental basis in the course of the
research and development conducted during the project and is provided
to users as so produced on an experimental basis. Accordingly, the
program is provided without any warranty whatsoever, whether express,
implied, statutory or otherwise. The term "warranty" used herein
includes, but is not limited to, any warranty of the quality,
performance, merchantability and fitness for a particular purpose of
the program and the nonexistence of any infringement or violation of
any right of any third party.
Each user of the program will agree and understand, and be deemed to
have agreed and understood, that there is no warranty whatsoever for
the program and, accordingly, the entire risk arising from or
otherwise connected with the program is assumed by the user.
Therefore, neither ICOT, the copyright holder, or any other
organization that participated in or was otherwise related to the
development of the program and their respective officials, directors,
officers and other employees shall be held liable for any and all
damages, including, without limitation, general, special, incidental
and consequential damages, arising out of or otherwise in connection
with the use or inability to use the program or any product, material
or result produced or otherwise obtained by using the program,
regardless of whether they have been advised of, or otherwise had
knowledge of, the possibility of such damages at any time during the
project or thereafter. Each user will be deemed to have agreed to the
foregoing by his or her commencement of use of the program. The term
"use" as used herein includes, but is not limited to, the use,
modification, copying and distribution of the program and the
production of secondary products from the program.
In the case where the program, whether in its original form or
modified, was distributed or delivered to or received by a user from
any person, organization or entity other than ICOT, unless it makes or
grants independently of ICOT any specific warranty to the user in
writing, such person, organization or entity, will also be exempted
from and not be held liable to the user for any such damages as noted
above as far as the program is concerned.

View File

@@ -22,6 +22,12 @@ lucene-analyzers-icu-XX.jar
International Components for Unicode (ICU). Note: this module depends on
the ICU4j jar file (version >= 4.6.0)
lucene-analyzers-kuromoji-XX.jar
An analyzer with morphological analysis for Japanese.
lucene-analyzers-morfologik-XX.jar
An analyzer using the Morfologik stemming library.
lucene-analyzers-phonetic-XX.jar
An add-on analysis library that provides phonetic encoders via Apache
Commons-Codec. Note: this module depends on the commons-codec jar
@@ -35,21 +41,20 @@ lucene-analyzers-stempel-XX.jar
An add-on analysis library that contains a universal algorithmic stemmer,
including tables for the Polish language.
lucene-analyzers-morfologik-XX.jar
An analyzer using the Morfologik stemming library.
common/src/java
icu/src/java
kuromoji/src/java
morfologik/src/java
phonetic/src/java
smartcn/src/java
stempel/src/java
morfologik/src/java
The source code for the ffve libraries.
The source code for the libraries.
common/src/test
icu/src/test
kuromoji/src/test
morfologik/src/test
phonetic/src/test
smartcn/src/test
stempel/src/test
morfologik/src/test
Unit tests for the five libraries.
Unit tests for the libraries.

View File

@@ -23,9 +23,10 @@
Additional Analyzers
- common: Additional Analyzers
- icu: Analyzers that use functionality from ICU
- kuromoji: Japanese Morphological Analyzer
- morfologik: Morfologik Stemmer
- smartcn: Smart Analyzer for Simplified Chinese Text
- stempel: Algorithmic Stemmer for Polish
- morfologik: Morfologik Stemmer
</description>
<target name="common">
@@ -36,6 +37,14 @@
<ant dir="icu" />
</target>
<target name="kuromoji">
<ant dir="kuromoji" />
</target>
<target name="morfologik">
<ant dir="morfologik" />
</target>
<target name="phonetic">
<ant dir="phonetic" />
</target>
@@ -48,52 +57,53 @@
<ant dir="stempel" />
</target>
<target name="morfologik">
<ant dir="morfologik" />
</target>
<target name="default" depends="compile"/>
<target name="compile" depends="common,icu,phonetic,smartcn,stempel,morfologik" />
<target name="compile" depends="common,icu,kuromoji,morfologik,phonetic,smartcn,stempel" />
<target name="clean">
<ant dir="common" target="clean" />
<ant dir="icu" target="clean" />
<ant dir="kuromoji" target="clean"/>
<ant dir="morfologik" target="clean" />
<ant dir="phonetic" target="clean" />
<ant dir="smartcn" target="clean" />
<ant dir="stempel" target="clean" />
<ant dir="morfologik" target="clean" />
</target>
<target name="validate">
<ant dir="common" target="validate" />
<ant dir="icu" target="validate" />
<ant dir="kuromoji" target="validate" />
<ant dir="morfologik" target="validate" />
<ant dir="phonetic" target="validate" />
<ant dir="smartcn" target="validate" />
<ant dir="stempel" target="validate" />
<ant dir="morfologik" target="validate" />
</target>
<target name="compile-core">
<ant dir="common" target="compile-core" />
<ant dir="icu" target="compile-core" />
<ant dir="kuromoji" target="compile-core" />
<ant dir="morfologik" target="compile-core" />
<ant dir="phonetic" target="compile-core" />
<ant dir="smartcn" target="compile-core" />
<ant dir="stempel" target="compile-core" />
<ant dir="morfologik" target="compile-core" />
</target>
<target name="compile-test">
<ant dir="common" target="compile-test" />
<ant dir="icu" target="compile-test" />
<ant dir="kuromoji" target="compile-test" />
<ant dir="morfologik" target="compile-test" />
<ant dir="phonetic" target="compile-test" />
<ant dir="smartcn" target="compile-test" />
<ant dir="stempel" target="compile-test" />
<ant dir="morfologik" target="compile-test" />
</target>
<target name="test">
<ant dir="common" target="test" />
<ant dir="icu" target="test" />
<ant dir="kuromoji" target="test" />
<ant dir="morfologik" target="test" />
<ant dir="phonetic" target="test" />
<ant dir="smartcn" target="test" />
<ant dir="stempel" target="test" />
<ant dir="morfologik" target="test" />
</target>
<target name="build-artifacts-and-tests" depends="default,compile-test" />
@@ -101,28 +111,31 @@
<target name="dist-maven" depends="default,javadocs">
<ant dir="common" target="dist-maven" />
<ant dir="icu" target="dist-maven" />
<ant dir="kuromoji" target="dist-maven" />
<ant dir="morfologik" target="dist-maven" />
<ant dir="phonetic" target="dist-maven" />
<ant dir="smartcn" target="dist-maven" />
<ant dir="stempel" target="dist-maven" />
<ant dir="morfologik" target="dist-maven" />
</target>
<target name="javadocs">
<ant dir="common" target="javadocs" />
<ant dir="icu" target="javadocs" />
<ant dir="kuromoji" target="javadocs" />
<ant dir="morfologik" target="javadocs" />
<ant dir="phonetic" target="javadocs" />
<ant dir="smartcn" target="javadocs" />
<ant dir="stempel" target="javadocs" />
<ant dir="morfologik" target="javadocs" />
</target>
<target name="javadocs-index.html">
<ant dir="common" target="javadocs-index.html" />
<ant dir="icu" target="javadocs-index.html" />
<ant dir="kuromoji" target="javadocs-index.html" />
<ant dir="morfologik" target="javadocs-index.html" />
<ant dir="phonetic" target="javadocs-index.html" />
<ant dir="smartcn" target="javadocs-index.html" />
<ant dir="stempel" target="javadocs-index.html" />
<ant dir="morfologik" target="javadocs-index.html" />
</target>
</project>

View File

@@ -0,0 +1,180 @@
package org.apache.lucene.analysis.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
* Breaks text into sentences with a {@link BreakIterator} and
* allows subclasses to decompose these sentences into words.
* <p>
* This can be used by subclasses that need sentence context
* for tokenization purposes, such as CJK segmenters.
* <p>
* Additionally it can be used by subclasses that want to mark
* sentence boundaries (with a custom attribute, extra token, position
* increment, etc) for downstream processing.
*
* @lucene.experimental
*/
public abstract class SegmentingTokenizerBase extends Tokenizer {
protected static final int BUFFERMAX = 4096;
protected final char buffer[] = new char[BUFFERMAX];
/** true length of text in the buffer */
private int length = 0;
/** length in buffer that can be evaluated safely, up to a safe end point */
private int usableLength = 0;
/** accumulated offset of previous buffers for this reader, for offsetAtt */
protected int offset = 0;
private final BreakIterator iterator;
private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/**
* Construct a new SegmentingTokenizerBase from the given Reader, using
* the provided BreakIterator for sentence segmentation.
* <p>
* Note that you should never share BreakIterators across different
* TokenStreams; instead, a newly created or cloned one should always
* be provided to this constructor.
*/
public SegmentingTokenizerBase(Reader input, BreakIterator iterator) {
super(input);
this.iterator = iterator;
}
@Override
public final boolean incrementToken() throws IOException {
if (length == 0 || !incrementWord()) {
while (!incrementSentence()) {
refill();
if (length <= 0) // no more chars to read
return false;
}
}
return true;
}
@Override
public void reset() throws IOException {
wrapper.setText(buffer, 0, 0);
iterator.setText(wrapper);
length = usableLength = offset = 0;
}
@Override
public void reset(Reader input) throws IOException {
this.input = input;
reset();
}
@Override
public final void end() throws IOException {
final int finalOffset = correctOffset(length < 0 ? offset : offset + length);
offsetAtt.setOffset(finalOffset, finalOffset);
}
/** Returns the last unambiguous break position in the text. */
private int findSafeEnd() {
for (int i = length - 1; i >= 0; i--)
if (isSafeEnd(buffer[i]))
return i + 1;
return -1;
}
/** For sentence tokenization, these are the unambiguous break positions. */
protected boolean isSafeEnd(char ch) {
switch(ch) {
case 0x000D:
case 0x000A:
case 0x0085:
case 0x2028:
case 0x2029:
return true;
default:
return false;
}
}
/**
* Refill the buffer, accumulating the offset and setting usableLength to the
* last unambiguous break position
*/
private void refill() throws IOException {
offset += usableLength;
int leftover = length - usableLength;
System.arraycopy(buffer, usableLength, buffer, 0, leftover);
int requested = buffer.length - leftover;
int returned = input.read(buffer, leftover, requested);
length = returned < 0 ? leftover : returned + leftover;
if (returned < requested) /* reader has been emptied, process the rest */
usableLength = length;
else { /* still more data to be read, find a safe-stopping place */
usableLength = findSafeEnd();
if (usableLength < 0)
usableLength = length; /*
* more than BUFFERMAX of text without breaks;
* tokens may be truncated
*/
}
wrapper.setText(buffer, 0, Math.max(0, usableLength));
iterator.setText(wrapper);
}
/**
* Returns true if another sentence was pulled from the buffer, or false
* if it is exhausted.
*/
private boolean incrementSentence() throws IOException {
if (length == 0) // we must refill the buffer
return false;
while (true) {
int start = iterator.current();
if (start == BreakIterator.DONE)
return false; // BreakIterator exhausted
// find the next set of boundaries
int end = iterator.next();
if (end == BreakIterator.DONE)
return false; // BreakIterator exhausted
setNextSentence(start, end);
if (incrementWord()) {
return true;
}
}
}
/** Provides the next input sentence for analysis */
protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);
/** Returns true if another word is available */
protected abstract boolean incrementWord();
}

View File

@@ -0,0 +1,224 @@
package org.apache.lucene.analysis.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/** Basic tests for {@link SegmentingTokenizerBase} */
public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
private Analyzer sentence = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new WholeSentenceTokenizer(reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
private Analyzer sentenceAndWord = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new SentenceAndWordTokenizer(reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
/** Some simple examples, just outputting the whole sentence boundaries as "terms" */
public void testBasics() throws IOException {
assertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence",
new String[] { "The acronym for United States is U.S. but this doesn't end a sentence"}
);
assertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.",
new String[] { "He said, \"Are you going?\" ",
"John shook his head." }
);
}
/** Test a subclass that sets some custom attribute values */
public void testCustomAttributes() throws IOException {
assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.",
new String[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" },
new int[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 },
new int[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 },
new int[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 }
);
}
/** Tests tokenstream reuse */
public void testReuse() throws IOException {
assertAnalyzesToReuse(sentenceAndWord, "He said, \"Are you going?\"",
new String[] { "He", "said", "Are", "you", "going" },
new int[] { 0, 3, 10, 14, 18 },
new int[] { 2, 7, 13, 17, 23 },
new int[] { 1, 1, 1, 1, 1 }
);
assertAnalyzesToReuse(sentenceAndWord, "John shook his head.",
new String[] { "John", "shook", "his", "head" },
new int[] { 0, 5, 11, 15 },
new int[] { 4, 10, 14, 19 },
new int[] { 1, 1, 1, 1 }
);
}
/** Tests TokenStream.end() */
public void testEnd() throws IOException {
// BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
// we add some junk whitespace to the end just to test it.
assertAnalyzesTo(sentenceAndWord, "John shook his head ",
new String[] { "John", "shook", "his", "head" }
);
assertAnalyzesTo(sentenceAndWord, "John shook his head. ",
new String[] { "John", "shook", "his", "head" }
);
}
/** Tests terms which span across boundaries */
public void testHugeDoc() throws IOException {
StringBuilder sb = new StringBuilder();
char whitespace[] = new char[4094];
Arrays.fill(whitespace, '\n');
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
assertAnalyzesTo(sentenceAndWord, input, new String[] { "testing", "1234" });
}
/** Tests the handling of binary/malformed data */
public void testHugeTerm() throws IOException {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 40960; i++) {
sb.append('a');
}
String input = sb.toString();
char token[] = new char[4096];
Arrays.fill(token, 'a');
String expectedToken = new String(token);
String expected[] = {
expectedToken, expectedToken, expectedToken,
expectedToken, expectedToken, expectedToken,
expectedToken, expectedToken, expectedToken,
expectedToken
};
assertAnalyzesTo(sentence, input, expected);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, sentence, 10000*RANDOM_MULTIPLIER);
checkRandomData(random, sentenceAndWord, 10000*RANDOM_MULTIPLIER);
}
// some tokenizers for testing
/** silly tokenizer that just returns whole sentences as tokens */
static class WholeSentenceTokenizer extends SegmentingTokenizerBase {
int sentenceStart, sentenceEnd;
boolean hasSentence;
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
public WholeSentenceTokenizer(Reader input) {
super(input, BreakIterator.getSentenceInstance(new Locale("")));
}
@Override
protected void setNextSentence(int sentenceStart, int sentenceEnd) {
this.sentenceStart = sentenceStart;
this.sentenceEnd = sentenceEnd;
hasSentence = true;
}
@Override
protected boolean incrementWord() {
if (hasSentence) {
hasSentence = false;
clearAttributes();
termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
offsetAtt.setOffset(offset+sentenceStart, offset+sentenceEnd);
return true;
} else {
return false;
}
}
}
/**
* simple tokenizer that bumps the position increment by 1 for tokens after a
* sentence boundary, to inhibit phrase queries without slop.
*/
static class SentenceAndWordTokenizer extends SegmentingTokenizerBase {
int sentenceStart, sentenceEnd;
int wordStart, wordEnd;
int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
public SentenceAndWordTokenizer(Reader input) {
super(input, BreakIterator.getSentenceInstance(new Locale("")));
}
@Override
protected void setNextSentence(int sentenceStart, int sentenceEnd) {
this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
this.sentenceEnd = sentenceEnd;
posBoost++;
}
@Override
public void reset() throws IOException {
super.reset();
posBoost = -1;
}
@Override
protected boolean incrementWord() {
wordStart = wordEnd;
while (wordStart < sentenceEnd) {
if (Character.isLetterOrDigit(buffer[wordStart]))
break;
wordStart++;
}
if (wordStart == sentenceEnd) return false;
wordEnd = wordStart+1;
while (wordEnd < sentenceEnd && Character.isLetterOrDigit(buffer[wordEnd]))
wordEnd++;
clearAttributes();
termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
offsetAtt.setOffset(offset+wordStart, offset+wordEnd);
posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
posBoost = 0;
return true;
}
}
}

View File

@@ -0,0 +1,121 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="analyzers-kuromoji" default="default">
<description>
Kuromoji Japanese Morphological Analyzer
</description>
<property name="build.dir" location="../build/kuromoji" />
<property name="dist.dir" location="../dist/kuromoji" />
<property name="ipadic.version" value="mecab-ipadic-2.7.0-20070801" />
<property name="dict.src.file" value="${ipadic.version}.tar.gz" />
<!-- <property name="dict.url" value="http://atilika.com/releases/mecab-ipadic/${dict.src.file}" /> -->
<property name="dict.url" value="http://mecab.googlecode.com/files/${dict.src.file}"/>
<property name="dict.src.dir" value="${build.dir}/${ipadic.version}" />
<property name="dict.encoding" value="euc-jp"/>
<property name="dict.format" value="ipadic"/>
<property name="dict.normalize" value="false"/>
<property name="dict.target.dir" location="./src/resources"/>
<import file="../../../lucene/contrib/contrib-build.xml"/>
<available type="dir" file="${build.dir}/${ipadic.version}" property="dict.available"/>
<path id="classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="base.classpath"/>
</path>
<target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
<target name="download-dict" unless="dict.available">
<get src="${dict.url}" dest="${build.dir}/${dict.src.file}"/>
<gunzip src="${build.dir}/${dict.src.file}"/>
<untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
</target>
<path id="tools.dependencies">
<fileset dir="../icu/lib" includes="icu4j-*.jar"/>
</path>
<path id="tools.classpath">
<path refid="classpath"/>
<path refid="tools.dependencies"/>
<pathelement location="${build.dir}/classes/java"/>
<pathelement location="${build.dir}/classes/tools"/>
</path>
<path id="tools.test.classpath">
<path refid="tools.classpath"/>
<path refid="test.base.classpath"/>
<pathelement location="${build.dir}/classes/tools-test"/>
</path>
<target name="build-dict" depends="compile-tools, download-dict">
<sequential>
<delete verbose="true">
<fileset dir="src/resources/org/apache/lucene/analysis/kuromoji/dict" includes="**/*"/>
</delete>
<java fork="true" failonerror="true" maxmemory="512m" classname="org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder">
<classpath>
<path refid="tools.classpath"/>
<pathelement path="${build.dir}/classes/tools"/>
</classpath>
<assertions>
<enable package="org.apache.lucene"/>
</assertions>
<arg value="${dict.format}"/>
<arg value="${dict.src.dir}"/>
<arg value="${dict.target.dir}"/>
<arg value="${dict.encoding}"/>
<arg value="${dict.normalize}"/>
</java>
</sequential>
</target>
<target name="compile-tools" depends="compile-core, common.compile-tools">
<compile
srcdir="src/tools/java"
destdir="${build.dir}/classes/tools">
<classpath>
<path refid="tools.classpath"/>
<pathelement path="src/tools/java"/>
</classpath>
</compile>
</target>
<target name="compile-tools-tests" depends="compile-tools">
<compile
srcdir="src/tools/test"
destdir="${build.dir}/classes/tools-test">
<classpath>
<path refid="tools.test.classpath"/>
<pathelement path="src/tools/test"/>
</classpath>
</compile>
</target>
<target name="test-tools" depends="compile-tools-tests">
<test-macro dataDir="src/tools/test" junit.classpath="tools.test.classpath"/>
</target>
<target name="compile-test" depends="contrib-build.compile-test, compile-tools-tests"/>
<target name="test" depends="contrib-build.test, test-tools"/>
</project>
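Given the targets above, the checked-in dictionary resources can be regenerated from the upstream mecab-ipadic release: running ant build-dict from modules/analysis/kuromoji should suffice, since build-dict already depends on compile-tools and download-dict (which fetches ${dict.src.file} from ${dict.url}). Note that build-dict first deletes the existing files under src/resources/org/apache/lucene/analysis/kuromoji/dict before invoking DictionaryBuilder.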

View File

@@ -0,0 +1,91 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
public class KuromojiAnalyzer extends StopwordAnalyzerBase {
private final Segmenter segmenter;
private final Set<String> stoptags;
public KuromojiAnalyzer(Version matchVersion) {
this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
}
public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, Set<?> stopwords, Set<String> stoptags) {
super(matchVersion, stopwords);
this.segmenter = segmenter;
this.stoptags = stoptags;
}
public static Set<?> getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
public static Set<String> getDefaultStopTags(){
return DefaultSetHolder.DEFAULT_STOP_TAGS;
}
/**
* Atomically loads DEFAULT_STOP_SET, DEFAULT_STOP_TAGS in a lazy fashion once the
* outer class accesses the static final set the first time.
*/
private static class DefaultSetHolder {
static final Set<?> DEFAULT_STOP_SET;
static final Set<String> DEFAULT_STOP_TAGS;
static {
try {
DEFAULT_STOP_SET = loadStopwordSet(false, KuromojiAnalyzer.class, "stopwords.txt", "#");
final CharArraySet tagset = loadStopwordSet(false, KuromojiAnalyzer.class, "stoptags.txt", "#");
DEFAULT_STOP_TAGS = new HashSet<String>();
for (Object element : tagset) {
char chars[] = (char[]) element;
DEFAULT_STOP_TAGS.add(new String(chars));
}
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
TokenStream stream = new LowerCaseFilter(matchVersion, tokenizer);
stream = new CJKWidthFilter(stream);
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
stream = new StopFilter(matchVersion, stream, stopwords);
stream = new KuromojiBaseFormFilter(stream);
return new TokenStreamComponents(tokenizer, stream);
}
}
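A minimal usage sketch of the chain built in createComponents() above (not part of the commit; the field name, sample text, and Version constant are placeholder assumptions):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class KuromojiAnalyzerExample {
  public static void main(String[] args) throws Exception {
    KuromojiAnalyzer analyzer = new KuromojiAnalyzer(Version.LUCENE_CURRENT);
    TokenStream ts = analyzer.tokenStream("body", new StringReader("関西国際空港に行った。"));
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // terms arrive lowercased, width-normalized and base-formed,
      // with stop POS tags and stopwords removed
      System.out.println(termAtt.toString());
    }
    ts.end();
    ts.close();
  }
}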

View File

@@ -0,0 +1,62 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* Replaces term text with the {@link BaseFormAttribute}.
* <p>
* This acts as a lemmatizer for verbs and adjectives.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class KuromojiBaseFormFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
public KuromojiBaseFormFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if (!keywordAtt.isKeyword()) {
String baseForm = basicFormAtt.getBaseForm();
if (baseForm != null) {
termAtt.setEmpty().append(baseForm);
}
}
return true;
} else {
return false;
}
}
}
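A sketch of the KeywordMarkerFilter arrangement the javadoc describes: terms in a (hypothetical) protected set are marked as keywords before this filter runs, so they escape lemmatization. Not part of the commit:

import java.io.Reader;
import java.util.Arrays;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class ProtectedBaseFormChain {
  public static TokenStream build(Reader reader) {
    // hypothetical set of surface forms to keep as-is
    CharArraySet protectedWords =
        new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList("飲み"), false);
    TokenStream stream = new KuromojiTokenizer(reader);
    stream = new KeywordMarkerFilter(stream, protectedWords); // sets KeywordAttribute
    return new KuromojiBaseFormFilter(stream);                // skips keyword-marked tokens
  }
}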

View File

@@ -0,0 +1,44 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
* Removes tokens that match a set of POS tags.
*/
public final class KuromojiPartOfSpeechStopFilter extends FilteringTokenFilter {
private final Set<String> stopTags;
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
public KuromojiPartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
super(enablePositionIncrements, input);
this.stopTags = stopTags;
}
@Override
protected boolean accept() throws IOException {
final String pos = posAtt.getPartOfSpeech();
return pos == null || !stopTags.contains(pos);
}
}
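Note that accept() does an exact match against the tag set, not a prefix match, so custom sets must list full IPADIC tags. A sketch with one such tag as an illustrative assumption (not part of the commit):

import java.io.Reader;
import java.util.Collections;
import org.apache.lucene.analysis.TokenStream;

public class ParticleStopChain {
  public static TokenStream build(Reader reader) {
    TokenStream stream = new KuromojiTokenizer(reader);
    // drop general case-marking particles; the tag string is a hypothetical example
    return new KuromojiPartOfSpeechStopFilter(true, stream,
        Collections.singleton("助詞-格助詞-一般"));
  }
}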

View File

@@ -0,0 +1,83 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.text.BreakIterator;
import java.util.List;
import java.util.Locale;
import org.apache.lucene.analysis.kuromoji.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.InflectionAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
public final class KuromojiTokenizer extends SegmentingTokenizerBase {
private static final BreakIterator proto = BreakIterator.getSentenceInstance(Locale.JAPAN);
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final BaseFormAttribute basicFormAtt = addAttribute(BaseFormAttribute.class);
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
private final InflectionAttribute inflectionAtt = addAttribute(InflectionAttribute.class);
private final Segmenter segmenter;
private List<Token> tokens;
private int tokenIndex = 0;
private int sentenceStart = 0;
public KuromojiTokenizer(Reader input) {
this(new Segmenter(), input);
}
public KuromojiTokenizer(Segmenter segmenter, Reader input) {
super(input, (BreakIterator) proto.clone());
this.segmenter = segmenter;
}
@Override
protected void setNextSentence(int sentenceStart, int sentenceEnd) {
this.sentenceStart = sentenceStart;
// TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us?
tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart, true);
tokenIndex = 0;
}
@Override
protected boolean incrementWord() {
if (tokenIndex == tokens.size()) {
return false;
}
Token token = tokens.get(tokenIndex);
int position = token.getPosition();
int length = token.getLength();
clearAttributes();
termAtt.copyBuffer(buffer, sentenceStart + position, length);
int startOffset = offset + sentenceStart + position;
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset+length));
basicFormAtt.setToken(token);
posAtt.setToken(token);
readingAtt.setToken(token);
inflectionAtt.setToken(token);
tokenIndex++;
return true;
}
}
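Used directly, the tokenizer exposes the per-token morphological attributes wired up above; a sketch (not part of the commit, and assuming ReadingAttribute exposes a getReading() accessor mirroring Token):

import java.io.StringReader;
import org.apache.lucene.analysis.kuromoji.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KuromojiTokenizerExample {
  public static void main(String[] args) throws Exception {
    KuromojiTokenizer tokenizer = new KuromojiTokenizer(new StringReader("寿司が食べたい"));
    CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
    PartOfSpeechAttribute pos = tokenizer.getAttribute(PartOfSpeechAttribute.class);
    ReadingAttribute reading = tokenizer.getAttribute(ReadingAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // surface form, POS tag, and katakana reading per token
      System.out.println(term + "\t" + pos.getPartOfSpeech() + "\t" + reading.getReading());
    }
    tokenizer.end();
    tokenizer.close();
  }
}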

View File

@@ -0,0 +1,214 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import org.apache.lucene.analysis.kuromoji.dict.*;
import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
/**
* Tokenizer main class.
* Thread safe.
*/
public class Segmenter {
public static enum Mode {
NORMAL, SEARCH, EXTENDED
}
private final Viterbi viterbi;
private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
private final boolean split;
public Segmenter() {
this(null, Mode.NORMAL, false);
}
public Segmenter(UserDictionary userDictionary, Mode mode) {
this(userDictionary, mode, false);
}
public Segmenter(UserDictionary userDictionary) {
this(userDictionary, Mode.NORMAL, false);
}
public Segmenter(Mode mode) {
this(null, mode, false);
}
public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) {
final TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
final UnknownDictionary unknownDict = UnknownDictionary.getInstance();
this.viterbi = new Viterbi(dict,
unknownDict,
ConnectionCosts.getInstance(),
userDictionary,
mode);
this.split = split;
dictionaryMap.put(Type.KNOWN, dict);
dictionaryMap.put(Type.UNKNOWN, unknownDict);
dictionaryMap.put(Type.USER, userDictionary);
}
/**
* Tokenize input text
* @param text input text
* @return list of Token
*/
public List<Token> tokenize(String text) {
if (!split) {
return doTokenize(0, text);
}
List<Integer> splitPositions = getSplitPositions(text);
if(splitPositions.size() == 0) {
return doTokenize(0, text);
}
ArrayList<Token> result = new ArrayList<Token>();
int offset = 0;
for(int position : splitPositions) {
result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
offset = position + 1;
}
if(offset < text.length()) {
result.addAll(doTokenize(offset, text.substring(offset)));
}
return result;
}
/**
* Split input text at 句読点 (Japanese punctuation), i.e. 。 and 、
* @param text input text
* @return list of split positions
*/
private List<Integer> getSplitPositions(String text) {
ArrayList<Integer> splitPositions = new ArrayList<Integer>();
int position = 0;
int currentPosition = 0;
while(true) {
int indexOfMaru = text.indexOf("。", currentPosition);
int indexOfTen = text.indexOf("、", currentPosition);
if(indexOfMaru < 0 || indexOfTen < 0) {
position = Math.max(indexOfMaru, indexOfTen);
} else {
position = Math.min(indexOfMaru, indexOfTen);
}
if(position >= 0) {
splitPositions.add(position);
currentPosition = position + 1;
} else {
break;
}
}
return splitPositions;
}
private List<Token> doTokenize(int offset, String sentence) {
char text[] = sentence.toCharArray();
return doTokenize(offset, text, 0, text.length, false);
}
/**
* Tokenize input sentence.
* @param offset offset of sentence in original input text
* @param sentence sentence to tokenize
* @param sentenceOffset offset into sentence
* @param sentenceLength length of the sentence to tokenize
* @param discardPunctuation true if punctuation tokens should be dropped
* @return list of Token
*/
public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
ArrayList<Token> result = new ArrayList<Token>();
ViterbiNode[][][] lattice;
try {
lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
} catch (IOException impossible) {
throw new RuntimeException(impossible);
}
List<ViterbiNode> bestPath = viterbi.search(lattice);
for (ViterbiNode node : bestPath) {
int wordId = node.getWordId();
if (node.getType() == Type.KNOWN && wordId == -1){ // Do not include BOS/EOS
continue;
} else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
continue; // Do not emit punctuation
}
Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
result.add(token);
}
return result;
}
/** returns a Graphviz String */
public String debugTokenize(String text) {
ViterbiNode[][][] lattice;
try {
lattice = this.viterbi.build(text.toCharArray(), 0, text.length());
} catch (IOException impossible) {
throw new RuntimeException(impossible);
}
List<ViterbiNode> bestPath = this.viterbi.search(lattice);
return new GraphvizFormatter(ConnectionCosts.getInstance())
.format(lattice[0], lattice[1], bestPath);
}
static final boolean isPunctuation(char ch) {
switch(Character.getType(ch)) {
case Character.SPACE_SEPARATOR:
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
case Character.CONTROL:
case Character.FORMAT:
case Character.DASH_PUNCTUATION:
case Character.START_PUNCTUATION:
case Character.END_PUNCTUATION:
case Character.CONNECTOR_PUNCTUATION:
case Character.OTHER_PUNCTUATION:
case Character.MATH_SYMBOL:
case Character.CURRENCY_SYMBOL:
case Character.MODIFIER_SYMBOL:
case Character.OTHER_SYMBOL:
case Character.INITIAL_QUOTE_PUNCTUATION:
case Character.FINAL_QUOTE_PUNCTUATION:
return true;
default:
return false;
}
}
}
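The Segmenter is also usable standalone, outside any TokenStream; a sketch using the Token accessors defined in the next file (not part of the commit; sample text is a placeholder):

public class SegmenterExample {
  public static void main(String[] args) {
    Segmenter segmenter = new Segmenter(); // NORMAL mode, no user dictionary
    for (Token token : segmenter.tokenize("お寿司が食べたい。")) {
      System.out.println(token.getSurfaceFormString()
          + "\t" + token.getBaseForm()       // null when the token is not inflected
          + "\t" + token.getPartOfSpeech());
    }
  }
}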

View File

@@ -0,0 +1,147 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
public class Token {
private final Dictionary dictionary;
private final int wordId;
private final char[] surfaceForm;
private final int offset;
private final int length;
private final int position;
private final Type type;
public Token(int wordId, char[] surfaceForm, int offset, int length, Type type, int position, Dictionary dictionary) {
this.wordId = wordId;
this.surfaceForm = surfaceForm;
this.offset = offset;
this.length = length;
this.type = type;
this.position = position;
this.dictionary = dictionary;
}
/**
* @return surfaceForm
*/
public char[] getSurfaceForm() {
return surfaceForm;
}
/**
* @return offset into surfaceForm
*/
public int getOffset() {
return offset;
}
/**
* @return length of surfaceForm
*/
public int getLength() {
return length;
}
/**
* @return surfaceForm as a String
*/
public String getSurfaceFormString() {
return new String(surfaceForm, offset, length);
}
/**
* @return reading. null if token doesn't have reading.
*/
public String getReading() {
return dictionary.getReading(wordId);
}
/**
* @return pronunciation. null if token doesn't have pronunciation.
*/
public String getPronunciation() {
return dictionary.getPronunciation(wordId);
}
/**
* @return part of speech.
*/
public String getPartOfSpeech() {
return dictionary.getPartOfSpeech(wordId);
}
/**
* @return inflection type or null
*/
public String getInflectionType() {
return dictionary.getInflectionType(wordId);
}
/**
* @return inflection form or null
*/
public String getInflectionForm() {
return dictionary.getInflectionForm(wordId);
}
/**
* @return base form or null if token is not inflected
*/
public String getBaseForm() {
return dictionary.getBaseForm(wordId);
}
/**
* Returns true if this token is a known word
* @return true if this token is in the standard dictionary, false if not
*/
public boolean isKnown() {
return type == Type.KNOWN;
}
/**
* Returns true if this token is an unknown word
* @return true if this token is an unknown word, false if not
*/
public boolean isUnknown() {
return type == Type.UNKNOWN;
}
/**
* Returns true if this token is defined in the user dictionary
* @return true if this token is in the user dictionary, false if not
*/
public boolean isUser() {
return type == Type.USER;
}
/**
* Get index of this token in input text
* @return position of token
*/
public int getPosition() {
return position;
}
}

View File

@@ -0,0 +1,291 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IOUtils;
public abstract class BinaryDictionary implements Dictionary {
public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
public static final String INFLDICT_FILENAME_SUFFIX = "$inflDict.dat";
public static final String DICT_HEADER = "kuromoji_dict";
public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
public static final String POSDICT_HEADER = "kuromoji_dict_pos";
public static final String INFLDICT_HEADER = "kuromoji_dict_infl";
public static final int VERSION = 1;
private final ByteBuffer buffer;
private final int[] targetMapOffsets, targetMap;
private final String[] posDict;
private final String[] inflTypeDict;
private final String[] inflFormDict;
protected BinaryDictionary() throws IOException {
InputStream mapIS = null, dictIS = null, posIS = null, inflIS = null;
IOException priorE = null;
int[] targetMapOffsets = null, targetMap = null;
String[] posDict = null;
String[] inflFormDict = null;
String[] inflTypeDict = null;
ByteBuffer buffer = null;
try {
mapIS = getResource(TARGETMAP_FILENAME_SUFFIX);
mapIS = new BufferedInputStream(mapIS);
DataInput in = new InputStreamDataInput(mapIS);
CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
targetMap = new int[in.readVInt()];
targetMapOffsets = new int[in.readVInt()];
int accum = 0, sourceId = 0;
for (int ofs = 0; ofs < targetMap.length; ofs++) {
final int val = in.readVInt();
if ((val & 0x01) != 0) {
targetMapOffsets[sourceId] = ofs;
sourceId++;
}
accum += val >>> 1;
targetMap[ofs] = accum;
}
if (sourceId + 1 != targetMapOffsets.length)
throw new IOException("targetMap file format broken");
targetMapOffsets[sourceId] = targetMap.length;
mapIS.close(); mapIS = null;
posIS = getResource(POSDICT_FILENAME_SUFFIX);
posIS = new BufferedInputStream(posIS);
in = new InputStreamDataInput(posIS);
CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
posDict = new String[in.readVInt()];
for (int j = 0; j < posDict.length; j++) {
posDict[j] = in.readString();
}
posIS.close(); posIS = null;
inflIS = getResource(INFLDICT_FILENAME_SUFFIX);
inflIS = new BufferedInputStream(inflIS);
in = new InputStreamDataInput(inflIS);
CodecUtil.checkHeader(in, INFLDICT_HEADER, VERSION, VERSION);
int length = in.readVInt();
inflTypeDict = new String[length];
inflFormDict = new String[length];
for (int j = 0; j < length; j++) {
inflTypeDict[j] = in.readString();
inflFormDict[j] = in.readString();
}
inflIS.close(); inflIS = null;
dictIS = getResource(DICT_FILENAME_SUFFIX);
// no buffering here, as we load in one large buffer
in = new InputStreamDataInput(dictIS);
CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
final int size = in.readVInt();
final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
final ReadableByteChannel channel = Channels.newChannel(dictIS);
final int read = channel.read(tmpBuffer);
if (read != size) {
throw new EOFException("Cannot read whole dictionary");
}
dictIS.close(); dictIS = null;
buffer = tmpBuffer.asReadOnlyBuffer();
} catch (IOException ioe) {
priorE = ioe;
} finally {
IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, inflIS, dictIS);
}
this.targetMap = targetMap;
this.targetMapOffsets = targetMapOffsets;
this.posDict = posDict;
this.inflTypeDict = inflTypeDict;
this.inflFormDict = inflFormDict;
this.buffer = buffer;
}
protected final InputStream getResource(String suffix) throws IOException {
return getClassResource(getClass(), suffix);
}
// util, reused by ConnectionCosts and CharacterDefinition
public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
if (is == null)
throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix);
return is;
}
public void lookupWordIds(int sourceId, IntsRef ref) {
ref.ints = targetMap;
ref.offset = targetMapOffsets[sourceId];
// targetMapOffsets always has one more entry pointing behind last:
ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
}
@Override
public int getLeftId(int wordId) {
return buffer.getShort(wordId);
}
@Override
public int getRightId(int wordId) {
return buffer.getShort(wordId + 2); // Skip left id
}
@Override
public int getWordCost(int wordId) {
return buffer.getShort(wordId + 4); // Skip left id and right id
}
@Override
public String getBaseForm(int wordId) {
int offset = baseFormOffset(wordId);
int length = (buffer.get(offset++) & 0xff) >>> 1;
if (length == 0) {
return null; // same as surface form
} else {
return readString(offset, length, false);
}
}
@Override
public String getReading(int wordId) {
int offset = readingOffset(wordId);
int readingData = buffer.get(offset++) & 0xff;
return readString(offset, readingData >>> 1, (readingData & 1) == 1);
}
@Override
public String getPartOfSpeech(int wordId) {
int posIndex = buffer.get(posOffset(wordId)) & 0xff; // read index into posDict
return posDict[posIndex >>> 1];
}
@Override
public String getPronunciation(int wordId) {
if (hasPronunciationData(wordId)) {
int offset = pronunciationOffset(wordId);
int pronunciationData = buffer.get(offset++) & 0xff;
return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
} else {
return getReading(wordId); // same as the reading
}
}
@Override
public String getInflectionType(int wordId) {
int index = getInflectionIndex(wordId);
return index < 0 ? null : inflTypeDict[index];
}
@Override
public String getInflectionForm(int wordId) {
int index = getInflectionIndex(wordId);
return index < 0 ? null : inflFormDict[index];
}
private static int posOffset(int wordId) {
return wordId + 6;
}
private static int baseFormOffset(int wordId) {
return wordId + 7;
}
private int readingOffset(int wordId) {
int offset = baseFormOffset(wordId);
int baseFormLength = buffer.get(offset++) & 0xfe; // mask away pronunciation bit
return offset + baseFormLength;
}
private int pronunciationOffset(int wordId) {
int offset = readingOffset(wordId);
int readingData = buffer.get(offset++) & 0xff;
final int readingLength;
if ((readingData & 1) == 0) {
readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
} else {
readingLength = readingData >>> 1;
}
return offset + readingLength;
}
private boolean hasPronunciationData(int wordId) {
int baseFormData = buffer.get(baseFormOffset(wordId)) & 0xff;
return (baseFormData & 1) == 0;
}
private boolean hasInflectionData(int wordId) {
int posData = buffer.get(posOffset(wordId)) & 0xff;
return (posData & 1) == 1;
}
private int getInflectionIndex(int wordId) {
if (!hasInflectionData(wordId)) {
return -1; // common case: no inflection data
}
// skip past reading/pronunciation at the end
int offset = hasPronunciationData(wordId) ? pronunciationOffset(wordId) : readingOffset(wordId);
int endData = buffer.get(offset++) & 0xff;
final int endLength;
if ((endData & 1) == 0) {
endLength = endData & 0xfe; // UTF-16: mask off kana bit
} else {
endLength = endData >>> 1;
}
offset += endLength;
byte b = buffer.get(offset++);
int i = b & 0x7F;
if ((b & 0x80) == 0) return i;
b = buffer.get(offset++);
i |= (b & 0x7F) << 7;
assert ((b & 0x80) == 0);
return i;
}
private String readString(int offset, int length, boolean kana) {
char text[] = new char[length];
if (kana) {
for (int i = 0; i < length; i++) {
text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff));
}
} else {
for (int i = 0; i < length; i++) {
text[i] = buffer.getChar(offset + (i << 1));
}
}
return new String(text);
}
}
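Taken together, the accessors above imply a fixed per-entry layout in the dictionary buffer. The following summary is reconstructed from the reader code only (the matching writer is part of the dictionary build step, not shown here):

// Per-word record layout in `buffer`, as implied by the accessors above:
//   wordId + 0 : short leftId
//   wordId + 2 : short rightId
//   wordId + 4 : short wordCost
//   wordId + 6 : byte  posData      = (index into posDict << 1) | hasInflection bit
//   wordId + 7 : byte  baseFormData = (baseForm length in chars << 1) | flag,
//                where flag == 1 means the pronunciation equals the reading
//                and is not stored separately
//   then       : baseForm as UTF-16 chars (only if length > 0)
//   then       : byte readingData = (length << 1) | isKana bit, followed by the
//                reading (single bytes offset from 0x30A0 if kana, else UTF-16)
//   then       : optional pronunciation, same encoding as the reading
//   then       : optional inflection index, a 1-2 byte vInt into
//                inflTypeDict/inflFormDict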


@ -0,0 +1,117 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
public final class CharacterDefinition {
public static final String FILENAME_SUFFIX = ".dat";
public static final String HEADER = "kuromoji_cd";
public static final int VERSION = 1;
public static final int CLASS_COUNT = CharacterClass.values().length;
// only used internally for lookup:
private static enum CharacterClass {
NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, KANJINUMERIC;
}
private final byte[] characterCategoryMap = new byte[0x10000];
private final boolean[] invokeMap = new boolean[CLASS_COUNT];
private final boolean[] groupMap = new boolean[CLASS_COUNT];
// the classes:
public static final byte NGRAM = (byte) CharacterClass.NGRAM.ordinal();
public static final byte DEFAULT = (byte) CharacterClass.DEFAULT.ordinal();
public static final byte SPACE = (byte) CharacterClass.SPACE.ordinal();
public static final byte SYMBOL = (byte) CharacterClass.SYMBOL.ordinal();
public static final byte NUMERIC = (byte) CharacterClass.NUMERIC.ordinal();
public static final byte ALPHA = (byte) CharacterClass.ALPHA.ordinal();
public static final byte CYRILLIC = (byte) CharacterClass.CYRILLIC.ordinal();
public static final byte GREEK = (byte) CharacterClass.GREEK.ordinal();
public static final byte HIRAGANA = (byte) CharacterClass.HIRAGANA.ordinal();
public static final byte KATAKANA = (byte) CharacterClass.KATAKANA.ordinal();
public static final byte KANJI = (byte) CharacterClass.KANJI.ordinal();
public static final byte KANJINUMERIC = (byte) CharacterClass.KANJINUMERIC.ordinal();
private CharacterDefinition() throws IOException {
IOException priorE = null;
InputStream is = null;
try {
is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
is = new BufferedInputStream(is);
final DataInput in = new InputStreamDataInput(is);
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
in.readBytes(characterCategoryMap, 0, characterCategoryMap.length);
for (int i = 0; i < CLASS_COUNT; i++) {
final byte b = in.readByte();
invokeMap[i] = (b & 0x01) != 0;
groupMap[i] = (b & 0x02) != 0;
}
} catch (IOException ioe) {
priorE = ioe;
} finally {
IOUtils.closeWhileHandlingException(priorE, is);
}
}
public byte getCharacterClass(char c) {
return characterCategoryMap[c];
}
public boolean isInvoke(char c) {
return invokeMap[characterCategoryMap[c]];
}
public boolean isGroup(char c) {
return groupMap[characterCategoryMap[c]];
}
public boolean isKanji(char c) {
final byte characterClass = characterCategoryMap[c];
return characterClass == KANJI || characterClass == KANJINUMERIC;
}
public static byte lookupCharacterClass(String characterClassName) {
return (byte) CharacterClass.valueOf(characterClassName).ordinal();
}
public static CharacterDefinition getInstance() {
return SingletonHolder.INSTANCE;
}
private static class SingletonHolder {
static final CharacterDefinition INSTANCE;
static {
try {
INSTANCE = new CharacterDefinition();
} catch (IOException ioe) {
throw new RuntimeException("Cannot load CharacterDefinition.", ioe);
}
}
}
}
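The constructor above unpacks one byte per character class into the invoke and group flags. A minimal writer-side sketch that would produce the same encoding, assuming a DataOutput `out` and the two boolean arrays (the real writer lives in the dictionary build tooling):

// hypothetical encoder matching the reader above
for (int i = 0; i < CLASS_COUNT; i++) {
  byte b = (byte) ((invokeMap[i] ? 0x01 : 0)    // bit 0: invoke flag
                 | (groupMap[i]  ? 0x02 : 0));  // bit 1: group flag
  out.writeByte(b);
}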


@ -0,0 +1,86 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
public final class ConnectionCosts {
public static final String FILENAME_SUFFIX = ".dat";
public static final String HEADER = "kuromoji_cc";
public static final int VERSION = 1;
private final short[][] costs; // array is backward IDs first since get is called using the same backward ID consecutively. maybe doesn't matter.
private ConnectionCosts() throws IOException {
IOException priorE = null;
InputStream is = null;
short[][] costs = null;
try {
is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
is = new BufferedInputStream(is);
final DataInput in = new InputStreamDataInput(is);
CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
int forwardSize = in.readVInt();
int backwardSize = in.readVInt();
costs = new short[backwardSize][forwardSize];
int accum = 0;
for (int j = 0; j < costs.length; j++) {
final short[] a = costs[j];
for (int i = 0; i < a.length; i++) {
int raw = in.readVInt();
accum += (raw >>> 1) ^ -(raw & 1);
a[i] = (short)accum;
}
}
} catch (IOException ioe) {
priorE = ioe;
} finally {
IOUtils.closeWhileHandlingException(priorE, is);
}
this.costs = costs;
}
public int get(int forwardId, int backwardId) {
return costs[backwardId][forwardId];
}
public static ConnectionCosts getInstance() {
return SingletonHolder.INSTANCE;
}
private static class SingletonHolder {
static final ConnectionCosts INSTANCE;
static {
try {
INSTANCE = new ConnectionCosts();
} catch (IOException ioe) {
throw new RuntimeException("Cannot load ConnectionCosts.", ioe);
}
}
}
}
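The decode loop above reads each cost as a zigzag-encoded delta from the running value, so costs close to their neighbors compress to one-byte vInts. A hypothetical encoder matching that decoder, assuming a DataOutput `out` and the costs iterated in the same backward-major order:

int prev = 0;
for (int j = 0; j < costs.length; j++) {
  for (int i = 0; i < costs[j].length; i++) {
    int delta = costs[j][i] - prev;
    out.writeVInt((delta << 1) ^ (delta >> 31)); // zigzag: sign moves to bit 0
    prev = costs[j][i];
  }
}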


@ -0,0 +1,88 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public interface Dictionary {
public static final String INTERNAL_SEPARATOR = "\u0000";
/**
* Get left id of specified word
* @param wordId
* @return left id
*/
public int getLeftId(int wordId);
/**
* Get right id of specified word
* @param wordId
* @return right id
*/
public int getRightId(int wordId);
/**
* Get word cost of specified word
* @param wordId
* @return word cost
*/
public int getWordCost(int wordId);
/**
* Get Part-Of-Speech of tokens
* @param wordId word ID of token
* @return Part-Of-Speech of the token
*/
public String getPartOfSpeech(int wordId);
/**
* Get reading of tokens
* @param wordId word ID of token
* @return Reading of the token
*/
public String getReading(int wordId);
/**
* Get base form of word
* @param wordId word ID of token
* @return Base form (only different for inflected words, otherwise null)
*/
public String getBaseForm(int wordId);
/**
* Get pronunciation of tokens
* @param wordId word ID of token
* @return Pronunciation of the token
*/
public String getPronunciation(int wordId);
/**
* Get inflection type of tokens
* @param wordId word ID of token
* @return inflection type, or null
*/
public String getInflectionType(int wordId);
/**
* Get inflection form of tokens
* @param wordId word ID of token
* @return inflection form, or null
*/
public String getInflectionForm(int wordId);
// TODO: maybe we should have an optional method, a non-typesafe
// 'getAdditionalData' if other dictionaries like unidic have additional data
}


@ -0,0 +1,72 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
public final class TokenInfoDictionary extends BinaryDictionary {
public static final String FST_FILENAME_SUFFIX = "$fst.dat";
private final TokenInfoFST fst;
private TokenInfoDictionary() throws IOException {
super();
IOException priorE = null;
InputStream is = null;
FST<Long> fst = null;
try {
is = getResource(FST_FILENAME_SUFFIX);
is = new BufferedInputStream(is);
fst = new FST<Long>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton(true));
} catch (IOException ioe) {
priorE = ioe;
} finally {
IOUtils.closeWhileHandlingException(priorE, is);
}
// TODO: some way to configure?
this.fst = new TokenInfoFST(fst, true);
}
public TokenInfoFST getFST() {
return fst;
}
public static TokenInfoDictionary getInstance() {
return SingletonHolder.INSTANCE;
}
private static class SingletonHolder {
static final TokenInfoDictionary INSTANCE;
static {
try {
INSTANCE = new TokenInfoDictionary();
} catch (IOException ioe) {
throw new RuntimeException("Cannot load TokenInfoDictionary.", ioe);
}
}
}
}


@ -0,0 +1,82 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
public final class TokenInfoFST {
private final FST<Long> fst;
// depending upon fasterButMoreRam, we cache root arcs for either
// kana (0x3040-0x30FF) or kana + han (0x3040-0x9FFF)
// false: 191 arcs
// true: 28,607 arcs (costs ~1.5MB)
private final int cacheCeiling;
private final FST.Arc<Long> rootCache[];
public final Long NO_OUTPUT;
public TokenInfoFST(FST<Long> fst, boolean fasterButMoreRam) throws IOException {
this.fst = fst;
this.cacheCeiling = fasterButMoreRam ? 0x9FFF : 0x30FF;
NO_OUTPUT = fst.outputs.getNoOutput();
rootCache = cacheRootArcs();
}
@SuppressWarnings("unchecked")
private FST.Arc<Long>[] cacheRootArcs() throws IOException {
FST.Arc<Long> rootCache[] = new FST.Arc[1+(cacheCeiling-0x3040)];
FST.Arc<Long> firstArc = new FST.Arc<Long>();
fst.getFirstArc(firstArc);
FST.Arc<Long> arc = new FST.Arc<Long>();
// TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs)
for (int i = 0; i < rootCache.length; i++) {
if (fst.findTargetArc(0x3040 + i, firstArc, arc) != null) {
rootCache[i] = new FST.Arc<Long>().copyFrom(arc);
}
}
return rootCache;
}
public FST.Arc<Long> findTargetArc(int ch, FST.Arc<Long> follow, FST.Arc<Long> arc, boolean useCache) throws IOException {
if (useCache && ch >= 0x3040 && ch <= cacheCeiling) {
assert ch != FST.END_LABEL;
final Arc<Long> result = rootCache[ch - 0x3040];
if (result == null) {
return null;
} else {
arc.copyFrom(result);
return arc;
}
} else {
return fst.findTargetArc(ch, follow, arc);
}
}
public Arc<Long> getFirstArc(FST.Arc<Long> arc) {
return fst.getFirstArc(arc);
}
/** @lucene.internal for testing only */
FST<Long> getInternalFST() {
return fst;
}
}
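A typical lookup walks the FST one character at a time, passing useCache only for the first transition (only root arcs are cached), and accumulates outputs until a final arc yields the entry's ordinal. A minimal sketch, assuming a loaded TokenInfoFST `fst` and a String `word`; this mirrors UserDictionary.lookup() and Viterbi.build() below:

FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>());
int output = 0;
for (int i = 0; i < word.length(); i++) {
  if (fst.findTargetArc(word.charAt(i), arc, arc, i == 0) == null) {
    break; // no dictionary entry continues this prefix
  }
  output += arc.output.intValue();
  if (arc.isFinal()) {
    int ord = output + arc.nextFinalOutput.intValue(); // ordinal of matched entry
  }
}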


@ -0,0 +1,83 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
public final class UnknownDictionary extends BinaryDictionary {
private final CharacterDefinition characterDefinition = CharacterDefinition.getInstance();
private UnknownDictionary() throws IOException {
super();
}
public int lookup(char[] text, int offset, int len) {
if(!characterDefinition.isGroup(text[offset])) {
return 1;
}
// Extract unknown word. Characters with the same character class are considered to be part of unknown word
byte characterIdOfFirstCharacter = characterDefinition.getCharacterClass(text[offset]);
int length = 1;
for (int i = 1; i < len; i++) {
if (characterIdOfFirstCharacter == characterDefinition.getCharacterClass(text[offset+i])){
length++;
} else {
break;
}
}
return length;
}
public CharacterDefinition getCharacterDefinition() {
return characterDefinition;
}
@Override
public String getReading(int wordId) {
return null;
}
@Override
public String getInflectionType(int wordId) {
return null;
}
@Override
public String getInflectionForm(int wordId) {
return null;
}
public static UnknownDictionary getInstance() {
return SingletonHolder.INSTANCE;
}
private static class SingletonHolder {
static final UnknownDictionary INSTANCE;
static {
try {
INSTANCE = new UnknownDictionary();
} catch (IOException ioe) {
throw new RuntimeException("Cannot load UnknownDictionary.", ioe);
}
}
}
}
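lookup() returns the length of the run of characters sharing the first character's class, but only when that class is flagged as grouping. An illustration, assuming the bundled character definition groups katakana as mecab-ipadic's char.def does:

char[] s = "アメリカ大統領".toCharArray();
int len = UnknownDictionary.getInstance().lookup(s, 0, s.length);
// len == 4: the katakana run "アメリカ" ends where the kanji "大" begins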


@ -0,0 +1,258 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
public final class UserDictionary implements Dictionary {
// phrase text -> phrase ID
private final TokenInfoFST fst;
// holds wordid, length, length... indexed by phrase ID
private final int segmentations[][];
// holds readings and POS, indexed by wordid
private final String data[];
private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
public static final int WORD_COST = -100000;
public static final int LEFT_ID = 5;
public static final int RIGHT_ID = 5;
public UserDictionary(Reader reader) throws IOException {
BufferedReader br = new BufferedReader(reader);
String line = null;
int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
List<String[]> featureEntries = new ArrayList<String[]>();
// text, segmentation, readings, POS
while ((line = br.readLine()) != null) {
// Remove comments
line = line.replaceAll("#.*$", "");
// Skip empty lines or comment lines
if (line.trim().length() == 0) {
continue;
}
String[] values = CSVUtil.parse(line);
featureEntries.add(values);
}
// TODO: should we allow multiple segmentations per input 'phrase'?
// the old treemap didn't support this either, and I'm not sure if it's needed/useful?
Collections.sort(featureEntries, new Comparator<String[]>() {
@Override
public int compare(String[] left, String[] right) {
return left[0].compareTo(right[0]);
}
});
List<String> data = new ArrayList<String>(featureEntries.size());
List<int[]> segmentations = new ArrayList<int[]>(featureEntries.size());
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRef scratch = new IntsRef();
long ord = 0;
for (String[] values : featureEntries) {
String[] segmentation = values[1].replaceAll(" +", " ").split(" "); // collapse runs of spaces
String[] readings = values[2].replaceAll(" +", " ").split(" ");
String pos = values[3];
if (segmentation.length != readings.length) {
// FIXME: Should probably deal with this differently. Exception?
System.out.println("This entry is not properly formatted : " + values[0]);
}
}
int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
wordIdAndLength[0] = wordId;
for (int i = 0; i < segmentation.length; i++) {
wordIdAndLength[i + 1] = segmentation[i].length();
data.add(readings[i] + INTERNAL_SEPARATOR + pos);
wordId++;
}
// add mapping to FST
String token = values[0];
scratch.grow(token.length());
scratch.length = token.length();
for (int i = 0; i < token.length(); i++) {
scratch.ints[i] = (int) token.charAt(i);
}
fstBuilder.add(scratch, fstOutput.get(ord));
segmentations.add(wordIdAndLength);
ord++;
}
this.fst = new TokenInfoFST(fstBuilder.finish(), false);
this.data = data.toArray(new String[data.size()]);
this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
}
/**
* Lookup words in text
* @param chars text
* @param off offset into text
* @param len length of text
* @return array of {wordId, position, length}
*/
public int[][] lookup(char[] chars, int off, int len) throws IOException {
// TODO: can we avoid this treemap/toIndexArray?
TreeMap<Integer, int[]> result = new TreeMap<Integer, int[]>(); // index, [length, length...]
boolean found = false; // true if we found any results
FST.Arc<Long> arc = new FST.Arc<Long>();
int end = off + len;
for (int startOffset = off; startOffset < end; startOffset++) {
arc = fst.getFirstArc(arc);
int output = 0;
int remaining = end - startOffset;
for (int i = 0; i < remaining; i++) {
int ch = chars[startOffset+i];
if (fst.findTargetArc(ch, arc, arc, i == 0) == null) {
break; // continue to next position
}
output += arc.output.intValue();
if (arc.isFinal()) {
output += arc.nextFinalOutput.intValue();
result.put(startOffset-off, segmentations[output]);
found = true;
}
}
}
return found ? toIndexArray(result) : EMPTY_RESULT;
}
private static final int[][] EMPTY_RESULT = new int[0][];
/**
* Convert Map of index and wordIdAndLength to array of {wordId, index, length}
* @param input
* @return array of {wordId, index, length}
*/
private int[][] toIndexArray(Map<Integer, int[]> input) {
ArrayList<int[]> result = new ArrayList<int[]>();
for (int i : input.keySet()) {
int[] wordIdAndLength = input.get(i);
int wordId = wordIdAndLength[0];
// convert length to index
int current = i;
for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
result.add(token);
current += wordIdAndLength[j];
}
}
return result.toArray(new int[result.size()][]);
}
@Override
public int getLeftId(int wordId) {
return LEFT_ID;
}
@Override
public int getRightId(int wordId) {
return RIGHT_ID;
}
@Override
public int getWordCost(int wordId) {
return WORD_COST;
}
@Override
public String getReading(int wordId) {
return getFeature(wordId, 0);
}
@Override
public String getPartOfSpeech(int wordId) {
return getFeature(wordId, 1);
}
@Override
public String getBaseForm(int wordId) {
return null; // TODO: add support?
}
@Override
public String getPronunciation(int wordId) {
return null; // TODO: add support?
}
@Override
public String getInflectionType(int wordId) {
return null; // TODO: add support?
}
@Override
public String getInflectionForm(int wordId) {
return null; // TODO: add support?
}
private String[] getAllFeaturesArray(int wordId) {
String allFeatures = data[wordId-CUSTOM_DICTIONARY_WORD_ID_OFFSET];
if(allFeatures == null) {
return null;
}
return allFeatures.split(INTERNAL_SEPARATOR);
}
private String getFeature(int wordId, int... fields) {
String[] allFeatures = getAllFeaturesArray(wordId);
if (allFeatures == null) {
return null;
}
StringBuilder sb = new StringBuilder();
if (fields.length == 0) { // All features
for (String feature : allFeatures) {
sb.append(CSVUtil.quoteEscape(feature)).append(",");
}
} else if (fields.length == 1) { // One feature doesn't need to escape value
sb.append(allFeatures[fields[0]]).append(",");
} else {
for (int field : fields){
sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
}
}
return sb.deleteCharAt(sb.length() - 1).toString();
}
}
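The constructor parses four-column CSV entries of the form surface,segmentation,readings,part-of-speech, with segments and readings separated by spaces. A usage sketch, assuming java.io.StringReader and calling code that handles IOException:

Reader entries = new StringReader(
    "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞\n");
UserDictionary userDict = new UserDictionary(entries);
char[] text = "関西国際空港".toCharArray();
int[][] matches = userDict.lookup(text, 0, text.length);
// each row is {wordId, position, length}: here three segments of length 2
// starting at positions 0, 2 and 4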


@ -0,0 +1,32 @@
package org.apache.lucene.analysis.kuromoji.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.Token;
import org.apache.lucene.util.Attribute;
/**
* Attribute for {@link Token#getBaseForm()}.
* <p>
* Note: depending on part of speech, this value may not be applicable,
* and will be null.
*/
public interface BaseFormAttribute extends Attribute {
public String getBaseForm();
public void setToken(Token token);
}


@ -0,0 +1,50 @@
package org.apache.lucene.analysis.kuromoji.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.Token;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
public class BaseFormAttributeImpl extends AttributeImpl implements BaseFormAttribute, Cloneable {
private Token token;
public String getBaseForm() {
return token == null ? null : token.getBaseForm();
}
public void setToken(Token token) {
this.token = token;
}
@Override
public void clear() {
token = null;
}
@Override
public void copyTo(AttributeImpl target) {
BaseFormAttribute t = (BaseFormAttribute) target;
t.setToken(token);
}
@Override
public void reflectWith(AttributeReflector reflector) {
reflector.reflect(BaseFormAttribute.class, "baseForm", getBaseForm());
}
}


@ -0,0 +1,33 @@
package org.apache.lucene.analysis.kuromoji.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.Token;
import org.apache.lucene.util.Attribute;
/**
* Attribute for Kuromoji inflection data.
* <p>
* Note: in some cases this value may not be applicable,
* and will be null.
*/
public interface InflectionAttribute extends Attribute {
public String getInflectionType();
public String getInflectionForm();
public void setToken(Token token);
}


@ -0,0 +1,62 @@
package org.apache.lucene.analysis.kuromoji.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.Token;
import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
public class InflectionAttributeImpl extends AttributeImpl implements InflectionAttribute, Cloneable {
private Token token;
public String getInflectionType() {
return token == null ? null : token.getInflectionType();
}
public String getInflectionForm() {
return token == null ? null : token.getInflectionForm();
}
public void setToken(Token token) {
this.token = token;
}
@Override
public void clear() {
token = null;
}
@Override
public void copyTo(AttributeImpl target) {
InflectionAttribute t = (InflectionAttribute) target;
t.setToken(token);
}
@Override
public void reflectWith(AttributeReflector reflector) {
String type = getInflectionType();
String typeEN = type == null ? null : ToStringUtil.getInflectionTypeTranslation(type);
reflector.reflect(InflectionAttribute.class, "inflectionType", type);
reflector.reflect(InflectionAttribute.class, "inflectionType (en)", typeEN);
String form = getInflectionForm();
String formEN = form == null ? null : ToStringUtil.getInflectedFormTranslation(form);
reflector.reflect(InflectionAttribute.class, "inflectionForm", form);
reflector.reflect(InflectionAttribute.class, "inflectionForm (en)", formEN);
}
}


@ -0,0 +1,29 @@
package org.apache.lucene.analysis.kuromoji.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.Token;
import org.apache.lucene.util.Attribute;
/**
* Attribute for {@link Token#getPartOfSpeech()}.
*/
public interface PartOfSpeechAttribute extends Attribute {
public String getPartOfSpeech();
public void setToken(Token token);
}


@ -0,0 +1,54 @@
package org.apache.lucene.analysis.kuromoji.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.Token;
import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
public class PartOfSpeechAttributeImpl extends AttributeImpl implements PartOfSpeechAttribute, Cloneable {
private Token token;
public String getPartOfSpeech() {
return token == null ? null : token.getPartOfSpeech();
}
public void setToken(Token token) {
this.token = token;
}
@Override
public void clear() {
token = null;
}
@Override
public void copyTo(AttributeImpl target) {
PartOfSpeechAttribute t = (PartOfSpeechAttribute) target;
t.setToken(token);
}
@Override
public void reflectWith(AttributeReflector reflector) {
String partOfSpeech = getPartOfSpeech();
String partOfSpeechEN = partOfSpeech == null ? null : ToStringUtil.getPOSTranslation(partOfSpeech);
reflector.reflect(PartOfSpeechAttribute.class, "partOfSpeech", partOfSpeech);
reflector.reflect(PartOfSpeechAttribute.class, "partOfSpeech (en)", partOfSpeechEN);
}
}


@ -0,0 +1,33 @@
package org.apache.lucene.analysis.kuromoji.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.Token;
import org.apache.lucene.util.Attribute;
/**
* Attribute for Kuromoji reading data.
* <p>
* Note: in some cases this value may not be applicable,
* and will be null.
*/
public interface ReadingAttribute extends Attribute {
public String getReading();
public String getPronunciation();
public void setToken(Token token);
}


@ -0,0 +1,62 @@
package org.apache.lucene.analysis.kuromoji.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.Token;
import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
public class ReadingAttributeImpl extends AttributeImpl implements ReadingAttribute, Cloneable {
private Token token;
public String getReading() {
return token == null ? null : token.getReading();
}
public String getPronunciation() {
return token == null ? null : token.getPronunciation();
}
public void setToken(Token token) {
this.token = token;
}
@Override
public void clear() {
token = null;
}
@Override
public void copyTo(AttributeImpl target) {
ReadingAttribute t = (ReadingAttribute) target;
t.setToken(token);
}
@Override
public void reflectWith(AttributeReflector reflector) {
String reading = getReading();
String readingEN = reading == null ? null : ToStringUtil.getRomanization(reading);
String pronunciation = getPronunciation();
String pronunciationEN = pronunciation == null ? null : ToStringUtil.getRomanization(pronunciation);
reflector.reflect(ReadingAttribute.class, "reading", reading);
reflector.reflect(ReadingAttribute.class, "reading (en)", readingEN);
reflector.reflect(ReadingAttribute.class, "pronunciation", pronunciation);
reflector.reflect(ReadingAttribute.class, "pronunciation (en)", pronunciationEN);
}
}


@ -0,0 +1,102 @@
package org.apache.lucene.analysis.kuromoji.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public final class CSVUtil {
private static final char QUOTE = '"';
private static final char COMMA = ',';
private static final Pattern QUOTE_REPLACE_PATTERN = Pattern.compile("^\"([^\"]+)\"$");
private static final String ESCAPED_QUOTE = "\"\"";
private CSVUtil() {} // no instance!!!
/**
* Parse CSV line
* @param line
* @return Array of values
*/
public static String[] parse(String line) {
boolean insideQuote = false;
ArrayList<String> result = new ArrayList<String>();
int quoteCount = 0;
StringBuilder sb = new StringBuilder();
for(int i = 0; i < line.length(); i++) {
char c = line.charAt(i);
if(c == QUOTE) {
insideQuote = !insideQuote;
quoteCount++;
}
if(c == COMMA && !insideQuote) {
String value = sb.toString();
value = unQuoteUnEscape(value);
result.add(value);
sb = new StringBuilder();
continue;
}
sb.append(c);
}
result.add(sb.toString());
// Validate
if(quoteCount % 2 != 0) {
return new String[0];
}
return result.toArray(new String[result.size()]);
}
private static String unQuoteUnEscape(String original) {
String result = original;
// Unquote
Matcher m = QUOTE_REPLACE_PATTERN.matcher(original);
if(m.matches()) {
result = m.group(1);
}
// Unescape
result = result.replaceAll(ESCAPED_QUOTE, "\"");
return result;
}
/**
* Quote and escape input value for CSV
* @param original
*/
public static String quoteEscape(String original) {
String result = original.replaceAll("\"", ESCAPED_QUOTE);
if(result.indexOf(COMMA) >= 0) {
result = "\"" + result + "\"";
}
return result;
}
}
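A round-trip illustration of the two public methods: quoted fields may contain commas, and quoteEscape() re-quotes any value that needs it:

String[] cols = CSVUtil.parse("hoge,\"foo,bar\",baz");
// cols == {"hoge", "foo,bar", "baz"}: the comma inside quotes does not split
String quoted = CSVUtil.quoteEscape("foo,bar");
// quoted == "\"foo,bar\"": a value containing a comma is wrapped in quotes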


@ -0,0 +1,226 @@
package org.apache.lucene.analysis.kuromoji.viterbi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
public class GraphvizFormatter {
private final static String BOS_LABEL = "BOS";
private final static String EOS_LABEL = "EOS";
private final static String FONT_NAME = "Helvetica";
private ConnectionCosts costs;
private Map<String, ViterbiNode> nodeMap;
private Map<String, String> bestPathMap;
private boolean foundBOS;
public GraphvizFormatter(ConnectionCosts costs) {
this.costs = costs;
this.nodeMap = new HashMap<String, ViterbiNode>();
this.bestPathMap = new HashMap<String, String>();
}
public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
initBestPathMap(null);
StringBuilder sb = new StringBuilder();
sb.append(formatHeader());
sb.append(formatNodes(startsArray, endsArray));
sb.append(formatTrailer());
return sb.toString();
}
public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List<ViterbiNode> bestPath) {
// List<ViterbiNode> bestPathWithBOSAndEOS = new ArrayList<ViterbiNode>(bestPath);
initBestPathMap(bestPath);
StringBuilder sb = new StringBuilder();
sb.append(formatHeader());
sb.append(formatNodes(startsArray, endsArray));
sb.append(formatTrailer());
return sb.toString();
}
private void initBestPathMap(List<ViterbiNode> bestPath) {
this.bestPathMap.clear();
if (bestPath == null){
return;
}
for (int i = 0; i < bestPath.size() - 1; i++) {
ViterbiNode from = bestPath.get(i);
ViterbiNode to = bestPath.get(i + 1);
String fromId = getNodeId(from);
String toId = getNodeId(to);
assert this.bestPathMap.containsKey(fromId) == false;
assert this.bestPathMap.containsValue(toId) == false;
this.bestPathMap.put(fromId, toId);
}
}
private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
this.nodeMap.clear();
this.foundBOS = false;
StringBuilder sb = new StringBuilder();
for (int i = 1; i < endsArray.length; i++) {
if(endsArray[i] == null || startsArray[i] == null) {
continue;
}
for (int j = 0; j < endsArray[i].length; j++) {
ViterbiNode from = endsArray[i][j];
if(from == null){
continue;
}
sb.append(formatNodeIfNew(from));
for (int k = 0; k < startsArray[i].length; k++) {
ViterbiNode to = startsArray[i][k];
if(to == null){
break;
}
sb.append(formatNodeIfNew(to));
sb.append(formatEdge(from, to));
}
}
}
return sb.toString();
}
private String formatNodeIfNew(ViterbiNode node) {
String nodeId = getNodeId(node);
if (! this.nodeMap.containsKey(nodeId)) {
this.nodeMap.put(nodeId, node);
return formatNode(node);
} else {
return "";
}
}
private String formatHeader() {
StringBuilder sb = new StringBuilder();
sb.append("digraph viterbi {\n");
sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n");
sb.append("# A2 paper size\n");
sb.append("size = \"34.4,16.5\";\n");
sb.append("# try to fill paper\n");
sb.append("ratio = fill;\n");
sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
return sb.toString();
}
private String formatTrailer() {
return "}";
}
private String formatEdge(ViterbiNode from, ViterbiNode to) {
if (this.bestPathMap.containsKey(getNodeId(from)) &&
this.bestPathMap.get(getNodeId(from)).equals(getNodeId(to))) {
return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 ");
} else {
return formatEdge(from, to, "");
}
}
private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) {
StringBuilder sb = new StringBuilder();
sb.append(getNodeId(from));
sb.append(" -> ");
sb.append(getNodeId(to));
sb.append(" [ ");
sb.append("label=\"");
sb.append(getCost(from, to));
sb.append("\"");
sb.append(" ");
sb.append(attributes);
sb.append(" ");
sb.append(" ]");
sb.append("\n");
return sb.toString();
}
private String formatNode(ViterbiNode node) {
StringBuilder sb = new StringBuilder();
sb.append("\"");
sb.append(getNodeId(node));
sb.append("\"");
sb.append(" [ ");
sb.append("label=");
sb.append(formatNodeLabel(node));
sb.append(" ]");
return sb.toString();
}
private String formatNodeLabel(ViterbiNode node) {
StringBuilder sb = new StringBuilder();
sb.append("<<table border=\"0\" cellborder=\"0\">");
sb.append("<tr><td>");
sb.append(getNodeLabel(node));
sb.append("</td></tr>");
sb.append("<tr><td>");
sb.append("<font color=\"blue\">");
sb.append(node.getWordCost());
sb.append("</font>");
sb.append("</td></tr>");
// sb.append("<tr><td>");
// sb.append(this.dictionary.get(node.getWordId()).getPosInfo());
// sb.append("</td></tr>");
sb.append("</table>>");
return sb.toString();
}
private String getNodeId(ViterbiNode node) {
return String.valueOf(node.hashCode());
}
private String getNodeLabel(ViterbiNode node) {
if (node.getType() == Type.KNOWN && node.getWordId() == 0) {
if (this.foundBOS) {
return EOS_LABEL;
} else {
this.foundBOS = true;
return BOS_LABEL;
}
} else {
return node.getSurfaceFormString();
}
}
private int getCost(ViterbiNode from, ViterbiNode to) {
return this.costs.get(from.getLeftId(), to.getRightId());
}
}
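A debugging sketch tying this formatter to the Viterbi lattice built below; assumes a configured Viterbi instance `viterbi` and an input char array `chars`:

ViterbiNode[][][] lattice = viterbi.build(chars, 0, chars.length);
String dot = new GraphvizFormatter(ConnectionCosts.getInstance())
    .format(lattice[0], lattice[1]);
// render with Graphviz, e.g.: dot -Tpng -o lattice.png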


@ -0,0 +1,361 @@
package org.apache.lucene.analysis.kuromoji.viterbi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST;
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
public class Viterbi {
private final TokenInfoFST fst;
private final TokenInfoDictionary dictionary;
private final UnknownDictionary unkDictionary;
private final ConnectionCosts costs;
private final UserDictionary userDictionary;
private final CharacterDefinition characterDefinition;
private final boolean useUserDictionary;
private final boolean searchMode;
private final boolean extendedMode;
private static final int DEFAULT_COST = 10000000;
private static final int SEARCH_MODE_LENGTH_KANJI = 3;
private static final int SEARCH_MODE_LENGTH = 7;
private static final int SEARCH_MODE_PENALTY = 10000;
private static final char[] BOS = "BOS".toCharArray();
private static final char[] EOS = "EOS".toCharArray();
/**
* Constructor
*/
public Viterbi(TokenInfoDictionary dictionary,
UnknownDictionary unkDictionary,
ConnectionCosts costs,
UserDictionary userDictionary,
Mode mode) {
this.dictionary = dictionary;
this.fst = dictionary.getFST();
this.unkDictionary = unkDictionary;
this.costs = costs;
this.userDictionary = userDictionary;
if(userDictionary == null) {
this.useUserDictionary = false;
} else {
this.useUserDictionary = true;
}
switch(mode){
case SEARCH:
searchMode = true;
extendedMode = false;
break;
case EXTENDED:
searchMode = true;
extendedMode = true;
break;
default:
searchMode = false;
extendedMode = false;
break;
}
this.characterDefinition = unkDictionary.getCharacterDefinition();
}
/**
* Find the best path through the input lattice.
* @param lattice the result of the build method
* @return list of ViterbiNodes that make up the best path
*/
public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
ViterbiNode[][] startIndexArr = lattice[0];
ViterbiNode[][] endIndexArr = lattice[1];
for (int i = 1; i < startIndexArr.length; i++){
if (startIndexArr[i] == null || endIndexArr[i] == null){ // skip if no token starts here or no previous token ends here
continue;
}
for (ViterbiNode node : startIndexArr[i]) {
if (node == null){ // If array doesn't contain ViterbiNode any more, continue to next index
break;
}
int backwardConnectionId = node.getLeftId();
int wordCost = node.getWordCost();
int leastPathCost = DEFAULT_COST;
for (ViterbiNode leftNode : endIndexArr[i]) {
if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index
break;
}
int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost; // cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
// "Search mode". Add extra costs if it is long node.
if (searchMode) {
// System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
char[] surfaceForm = node.getSurfaceForm();
int offset = node.getOffset();
int length = node.getLength();
if (length > SEARCH_MODE_LENGTH_KANJI) {
boolean allKanji = true;
// check if node consists of only kanji
for (int pos = 0; pos < length; pos++) {
if (!characterDefinition.isKanji(surfaceForm[offset+pos])){
allKanji = false;
break;
}
}
if (allKanji) { // Process only Kanji keywords
pathCost += (length - SEARCH_MODE_LENGTH_KANJI) * SEARCH_MODE_PENALTY;
} else if (length > SEARCH_MODE_LENGTH) {
pathCost += (length - SEARCH_MODE_LENGTH) * SEARCH_MODE_PENALTY;
}
}
}
if (pathCost < leastPathCost){ // If total cost is lower than before, set current previous node as best left node (previous means left).
leastPathCost = pathCost;
node.setPathCost(leastPathCost);
node.setLeftNode(leftNode);
}
}
}
}
// track best path
ViterbiNode node = endIndexArr[0][0]; // EOS
LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
result.add(node);
while (true) {
ViterbiNode leftNode = node.getLeftNode();
if (leftNode == null) {
break;
}
// EXTENDED mode convert unknown word into unigram node
if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
byte unigramWordId = CharacterDefinition.NGRAM;
int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
int unigramRightId = unkDictionary.getRightId(unigramWordId); // isn't required
int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
char[] surfaceForm = leftNode.getSurfaceForm();
int offset = leftNode.getOffset();
int length = leftNode.getLength();
for (int i = length - 1; i >= 0; i--) {
int charLen = 1;
if (i > 0 && Character.isLowSurrogate(surfaceForm[offset+i])) {
i--;
charLen = 2;
}
ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm, offset + i, charLen, unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i, Type.UNKNOWN);
result.addFirst(uniGramNode);
}
} else {
result.addFirst(leftNode);
}
node = leftNode;
}
return result;
}
/**
* Build lattice from input text
* @param text
*/
public ViterbiNode[][][] build(char text[], int offset, int length) throws IOException {
ViterbiNode[][] startIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS
ViterbiNode[][] endIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS
int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr
int[] endSizeArr = new int[length + 2]; // array to keep ViterbiNode count in endIndexArr
FST.Arc<Long> arc = new FST.Arc<Long>();
ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
// Process user dictionary;
if (useUserDictionary) {
processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
int unknownWordEndIndex = -1; // index of the last character of unknown word
final IntsRef wordIdRef = new IntsRef();
for (int startIndex = 0; startIndex < length; startIndex++) {
// If no token ends where current token starts, skip this index
if (endSizeArr[startIndex + 1] == 0) {
continue;
}
int suffixStart = offset + startIndex;
int suffixLength = length - startIndex;
boolean found = false;
arc = fst.getFirstArc(arc);
int output = 0;
for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
int ch = text[suffixStart + endIndex - 1];
if (fst.findTargetArc(ch, arc, arc, endIndex == 1) == null) {
break; // continue to next position
}
output += arc.output.intValue();
if (arc.isFinal()) {
output += arc.nextFinalOutput.intValue();
found = true; // Don't produce unknown word starting from this index
dictionary.lookupWordIds(output, wordIdRef);
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, endIndex, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
}
}
// In normal (non-search) mode, don't extract another unknown word while a previous one still covers this position.
if(!searchMode && unknownWordEndIndex > startIndex){
continue;
}
// Process unknown word: for an "invoke" character class an unknown word is
// always extracted; otherwise only when no known token starts here.
int unknownWordLength = 0;
char firstCharacter = text[suffixStart];
if (characterDefinition.isInvoke(firstCharacter) || !found) {
unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
}
if (unknownWordLength > 0) { // found unknown word
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
unkDictionary.lookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, unknownWordLength, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
unknownWordEndIndex = startIndex + unknownWordLength;
}
}
ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
return result;
}
/**
* Find user-dictionary token(s) in the input text and add them to the lattice arrays as user tokens
* @param text
* @param startIndexArr
* @param endIndexArr
* @param startSizeArr
* @param endSizeArr
*/
private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) throws IOException {
int[][] result = userDictionary.lookup(text, offset, len);
for(int[] segmentation : result) {
int wordId = segmentation[0];
int index = segmentation[1];
int length = segmentation[2];
ViterbiNode node = new ViterbiNode(wordId, text, offset + index, length, userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
}
/**
* Add the node to both arrays and increment the counts in the size arrays
* @param node
* @param startIndex
* @param endIndex
* @param startIndexArr
* @param endIndexArr
* @param startSizeArr
* @param endSizeArr
*/
private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
int startNodesCount = startSizeArr[startIndex];
int endNodesCount = endSizeArr[endIndex];
if (startNodesCount == 0) {
startIndexArr[startIndex] = new ViterbiNode[10];
}
if (endNodesCount == 0) {
endIndexArr[endIndex] = new ViterbiNode[10];
}
if (startIndexArr[startIndex].length <= startNodesCount){
startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
}
if (endIndexArr[endIndex].length <= endNodesCount){
endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
}
startIndexArr[startIndex][startNodesCount] = node;
endIndexArr[endIndex][endNodesCount] = node;
startSizeArr[startIndex] = startNodesCount + 1;
endSizeArr[endIndex] = endNodesCount + 1;
}
/**
* Return an array twice the size of the input, containing the input array's values
* @param array array to extend
* @return the extended array
*/
private ViterbiNode[] extendArray(ViterbiNode[] array) {
ViterbiNode[] newArray = new ViterbiNode[array.length * 2];
System.arraycopy(array, 0, newArray, 0, array.length);
return newArray;
}
}

View File

@@ -0,0 +1,147 @@
package org.apache.lucene.analysis.kuromoji.viterbi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public final class ViterbiNode {
public enum Type {
KNOWN,
UNKNOWN,
USER
}
private final int wordId;
private final char[] surfaceForm;
private final int offset;
private final int length;
private final int leftId;
private final int rightId;
/** word cost for this node */
private final int wordCost;
/** minimum path cost found thus far */
private int pathCost;
private ViterbiNode leftNode;
private final Type type;
private final int startIndex;
public ViterbiNode(int wordId, char[] surfaceForm, int offset, int length, int leftId, int rightId, int wordCost, int startIndex, Type type) {
this.wordId = wordId;
this.surfaceForm = surfaceForm;
this.offset = offset;
this.length = length;
this.leftId = leftId;
this.rightId = rightId;
this.wordCost = wordCost;
this.startIndex = startIndex;
this.type = type;
}
/**
* @return the wordId
*/
public int getWordId() {
return wordId;
}
/**
* @return the surfaceForm
*/
public char[] getSurfaceForm() {
return surfaceForm;
}
/**
* @return start offset into surfaceForm
*/
public int getOffset() {
return offset;
}
/**
* @return length of surfaceForm
*/
public int getLength() {
return length;
}
/**
* @return the surfaceForm as a String
*/
public String getSurfaceFormString() {
return new String(surfaceForm, offset, length);
}
/**
* @return the leftId
*/
public int getLeftId() {
return leftId;
}
/**
* @return the rightId
*/
public int getRightId() {
return rightId;
}
/**
* @return the wordCost
*/
public int getWordCost() {
return wordCost;
}
/**
* @return the minimum path cost found thus far
*/
public int getPathCost() {
return pathCost;
}
/**
* @param pathCost minimum path cost found thus far
*/
public void setPathCost(int pathCost) {
this.pathCost = pathCost;
}
public void setLeftNode(ViterbiNode node) {
leftNode = node;
}
public ViterbiNode getLeftNode() {
return leftNode;
}
public int getStartIndex() {
return startIndex;
}
public Type getType() {
return type;
}
}

View File

@@ -0,0 +1,26 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<title>
analyzers-kuromoji
</title>
</head>
<body>
analyzers-kuromoji
</body>
</html>

View File

@@ -0,0 +1,410 @@
# set of default stop tags:
# uncomment a part of speech to treat those words as stopwords.
# the entire tagset is provided here for convenience.
#
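# For example, to also treat proper nouns as stopwords, uncomment the
# 名詞-固有名詞 line below; sub-classifications such as 名詞-固有名詞-人名
# have their own lines and are uncommented separately.
#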
#####
# noun: unclassified nouns
#名詞
#
# noun-common: Common nouns or nouns where the sub-classification is undefined
#名詞-一般
#
# noun-proper: Proper nouns where the sub-classification is undefined
#名詞-固有名詞
#
# noun-proper-misc: miscellaneous proper nouns
#名詞-固有名詞-一般
#
# noun-proper-person: Personal names where the sub-classification is undefined
#名詞-固有名詞-人名
#
# noun-proper-person-misc: names that cannot be divided into surname and
# given name; foreign names; names where the surname or given name is unknown.
# e.g. お市の方
#名詞-固有名詞-人名-一般
#
# noun-proper-person-surname: Mainly Japanese surnames.
# e.g. 山田
#名詞-固有名詞-人名-姓
#
# noun-proper-person-given_name: Mainly Japanese given names.
# e.g. 太郎
#名詞-固有名詞-人名-名
#
# noun-proper-organization: Names representing organizations.
# e.g. 通産省, NHK
#名詞-固有名詞-組織
#
# noun-proper-place: Place names where the sub-classification is undefined
#名詞-固有名詞-地域
#
# noun-proper-place-misc: Place names excluding countries.
# e.g. アジア, バルセロナ, 京都
#名詞-固有名詞-地域-一般
#
# noun-proper-place-country: Country names.
# e.g. 日本, オーストラリア
#名詞-固有名詞-地域-国
#
# noun-pronoun: Pronouns where the sub-classification is undefined
#名詞-代名詞
#
# noun-pronoun-misc: miscellaneous pronouns:
# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ
#名詞-代名詞-一般
#
# noun-pronoun-contraction: Spoken language contraction made by combining a
# pronoun and the particle 'wa'.
# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ
#名詞-代名詞-縮約
#
# noun-adverbial: Temporal nouns such as names of days or months that behave
# like adverbs. Nouns that represent amount or ratios and can be used adverbially,
# e.g. 金曜, 一月, 午後, 少量
#名詞-副詞可能
#
# noun-verbal: Nouns that take arguments with case and can appear followed by
# 'suru' and related verbs (する, できる, なさる, くださる)
# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り
#名詞-サ変接続
#
# noun-adjective-base: The base form of adjectives, words that appear before な ("na")
# e.g. 健康, 安易, 駄目, だめ
#名詞-形容動詞語幹
#
# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数.
# e.g. 0, 1, 2, 何, 数, 幾
#名詞-数
#
# noun-affix: noun affixes where the sub-classification is undefined
#名詞-非自立
#
# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that
# attach to the base form of inflectional words, words that cannot be classified
# into any of the other categories below. This category includes indefinite nouns.
# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第,
# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み,
# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳,
# わり, 割り, 割, ん-口語/, もん-口語/
#名詞-非自立-一般
#
# noun-affix-adverbial: noun affixes that can behave as adverbs.
# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ,
# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか,
# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所,
# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま,
# 儘, 侭, みぎり, 矢先
#名詞-非自立-副詞可能
#
# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars
# with the stem よう(だ) ("you(da)").
# e.g. よう, やう, 様 (よう)
#名詞-非自立-助動詞語幹
#
# noun-affix-adjective-base: noun affixes that can connect to the indeclinable
# connection form な (aux "da").
# e.g. みたい, ふう
#名詞-非自立-形容動詞語幹
#
# noun-special: special nouns where the sub-classification is undefined.
#名詞-特殊
#
# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is
# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base
# form of inflectional words.
# e.g. そう
#名詞-特殊-助動詞語幹
#
# noun-suffix: noun suffixes where the sub-classification is undefined.
#名詞-接尾
#
# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect
# to ガル or タイ and can combine into compound nouns, words that cannot be classified into
# any of the other categories below. In general, this category is more inclusive than
# 接尾語 ("suffix") and is usually the last element in a compound noun.
# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み,
# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用
#名詞-接尾-一般
#
# noun-suffix-person: Suffixes that form nouns and attach to person names more often
# than other nouns.
# e.g. 君, 様, 著
#名詞-接尾-人名
#
# noun-suffix-place: Suffixes that form nouns and attach to place names more often
# than other nouns.
# e.g. 町, 市, 県
#名詞-接尾-地域
#
# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that
# can appear before スル ("suru").
# e.g. 化, 視, 分け, 入り, 落ち, 買い
#名詞-接尾-サ変接続
#
# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions,
# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the
# conjunctive form of inflectional words.
# e.g. そう
#名詞-接尾-助動詞語幹
#
# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive
# form of inflectional words and appear before the copula だ ("da").
# e.g. 的, げ, がち
#名詞-接尾-形容動詞語幹
#
# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ)
#名詞-接尾-副詞可能
#
# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category
# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach
# to numbers.
# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半
#名詞-接尾-助数詞
#
# noun-suffix-special: Special suffixes that mainly attach to inflecting words.
# e.g. (楽し) さ, (考え) 方
#名詞-接尾-特殊
#
# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words
# together.
# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦)
#名詞-接続詞的
#
# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are
# semantically verb-like.
# e.g. ごらん, ご覧, 御覧, 頂戴
#名詞-動詞非自立的
#
# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry,
# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation")
# is いわく ("iwaku").
#名詞-引用文字列
#
# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and
# behave like an adjective.
# e.g. 申し訳, 仕方, とんでも, 違い
#名詞-ナイ形容詞語幹
#
#####
# prefix: unclassified prefixes
接頭詞
#
# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
# excluding numerical expressions.
# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派)
接頭詞-名詞接続
#
# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
# in conjunctive form followed by なる/なさる/くださる.
# e.g. お (読みなさい), お (座り)
接頭詞-動詞接続
#
# prefix-adjectival: Prefixes that attach to adjectives.
# e.g. お (寒いですねえ), バカ (でかい)
接頭詞-形容詞接続
#
# prefix-numerical: Prefixes that attach to numerical expressions.
# e.g. 約, およそ, 毎時
接頭詞-数接続
#
#####
# verb: unclassified verbs
#動詞
#
# verb-main:
#動詞-自立
#
# verb-auxiliary:
動詞-非自立
#
# verb-suffix:
#動詞-接尾
#
#####
# adjective: unclassified adjectives
#形容詞
#
# adjective-main:
#形容詞-自立
#
# adjective-auxiliary:
#形容詞-非自立
#
# adjective-suffix:
#形容詞-接尾
#
#####
# adverb: unclassified adverbs
#副詞
#
# adverb-misc: Words that can be segmented into one unit and where adnominal
# modification is not possible.
# e.g. あいかわらず, 多分
#副詞-一般
#
# adverb-particle_conjunction: Adverbs that can be followed by の, は, に,
# な, する, だ, etc.
# e.g. こんなに, そんなに, あんなに, なにか, なんでも
#副詞-助詞類接続
#
#####
# adnominal: Words that only have noun-modifying forms.
# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう,
# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした,
# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き
#連体詞
#
#####
# conjunction: Conjunctions that can occur independently.
# e.g. が, けれども, そして, じゃあ, それどころか
接続詞
#
#####
# particle: unclassified particles.
助詞
#
# particle-case: case particles where the subclassification is undefined.
助詞-格助詞
#
# particle-case-misc: Case particles.
# e.g. から, が, で, と, に, へ, より, を, の, にて
助詞-格助詞-一般
#
# particle-case-quote: the "to" that appears after nouns, a person's speech,
# quotation marks, expressions of decisions from a meeting, reasons, judgements,
# conjectures, etc.
# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...)
助詞-格助詞-引用
#
# particle-case-compound: Compounds of particles and verbs that mainly behave
# like case particles.
# e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って,
# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける,
# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し,
# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして,
# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって,
# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る,
# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる,
# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ
助詞-格助詞-連語
#
# particle-conjunctive:
# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども,
# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/,
# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/
助詞-接続助詞
#
# particle-dependency:
# e.g. こそ, さえ, しか, すら, は, も, ぞ
助詞-係助詞
#
# particle-adverbial:
# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/,
# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/,
# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに,
# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/,
# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」)
助詞-副助詞
#
# particle-interjective: particles with interjective grammatical roles.
# e.g. (松島) や
助詞-間投助詞
#
# particle-coordinate:
# e.g. と, たり, だの, だり, とか, なり, や, やら
助詞-並立助詞
#
# particle-final:
# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ,
# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/
助詞-終助詞
#
# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is
# adverbial, conjunctive, or sentence final. For example:
# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」
# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」
# 「(祈りが届いたせい) か (, 試験に合格した.)」
# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」
# e.g. か
助詞-副助詞/並立助詞/終助詞
#
# particle-adnominalizer: The "no" that attaches to nouns and modifies
# non-inflectional words.
助詞-連体化
#
# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs
# that are giongo, giseigo, or gitaigo.
# e.g. に, と
助詞-副詞化
#
# particle-special: A particle that does not fit into one of the above classifications.
# This includes particles that are used in Tanka, Haiku, and other poetry.
# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家)
助詞-特殊
#
#####
# auxiliary-verb:
助動詞
#
#####
# interjection: Greetings and other exclamations.
# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます,
# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい
感動詞
#
#####
# symbol: unclassified Symbols.
#記号
#
# symbol-misc: A general symbol not in one of the categories below.
# e.g. [○◎@$〒→+]
記号-一般
#
# symbol-comma: Commas
# e.g. [,、]
記号-読点
#
# symbol-period: Periods and full stops.
# e.g. [..。]
記号-句点
#
# symbol-space: Full-width whitespace.
記号-空白
#
# symbol-open_bracket:
# e.g. [({‘“『【]
記号-括弧開
#
# symbol-close_bracket:
# e.g. [)}’”』」】]
記号-括弧閉
#
# symbol-alphabetic:
#記号-アルファベット
#
#####
# other: unclassified other
#その他
#
# other-interjection: Words that are hard to classify as noun-suffixes or
# sentence-final particles.
# e.g. (だ)ァ
その他-間投
#
#####
# filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
# e.g. あの, うんと, えと
フィラー
#
#####
# non-verbal: non-verbal sound.
非言語音
#
#####
# fragment:
#語断片
#
#####
# unknown: unknown part of speech.
#未知語

View File

@@ -0,0 +1,13 @@
# short set of japanese stopwords
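# one entry per line; entries are matched against the surface forms of
# tokens produced by the segmenter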
いう
する
人物
さま
すること
ため
もの
おいて
なる
できる
おく
ある

View File

@@ -0,0 +1,231 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.List;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
public class SegmenterTest extends LuceneTestCase {
private static Segmenter segmenter;
@BeforeClass
public static void setUpBeforeClass() throws Exception {
segmenter = new Segmenter();
}
@AfterClass
public static void afterClass() throws Exception {
segmenter = null;
}
@Test
public void testSegmentation() {
// Skip tests for Michelle Kwan -- UniDic segments Kwan as ワン
// String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
// String[] surfaceForms = {
// "ミシェル", "", "クワン", "", "優勝", "", "まし", "", "",
// "スペース", "ステーション", "", "行き", "ます", "",
// "うたがわしい", ""
// };
String input = "スペースステーションに行きます。うたがわしい。";
String[] surfaceForms = {
"スペース", "ステーション", "", "行き", "ます", "",
"うたがわしい", ""
};
List<Token> tokens = segmenter.tokenize(input);
assertTrue(tokens.size() == surfaceForms.length);
for (int i = 0; i < tokens.size(); i++) {
assertEquals(surfaceForms[i], tokens.get(i).getSurfaceFormString());
}
}
@Test
public void testReadings() {
List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
assertEquals(6, tokens.size());
assertEquals("スシ", tokens.get(0).getReading());
assertEquals("", tokens.get(1).getReading());
assertEquals("タベ", tokens.get(2).getReading());
assertEquals("タイ", tokens.get(3).getReading());
assertEquals("デス", tokens.get(4).getReading());
assertEquals("", tokens.get(5).getReading());
}
@Test
public void testReadings2() {
List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
assertEquals(9, tokens.size());
assertEquals("オオク", tokens.get(0).getReading());
assertEquals("", tokens.get(1).getReading());
assertEquals("ガクセイ", tokens.get(2).getReading());
assertEquals("", tokens.get(3).getReading());
assertEquals("シケン", tokens.get(4).getReading());
assertEquals("", tokens.get(5).getReading());
assertEquals("オチ", tokens.get(6).getReading());
assertEquals("", tokens.get(7).getReading());
assertEquals("", tokens.get(8).getReading());
}
@Test
public void testPronunciations() {
List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
assertEquals(6, tokens.size());
assertEquals("スシ", tokens.get(0).getPronunciation());
assertEquals("", tokens.get(1).getPronunciation());
assertEquals("タベ", tokens.get(2).getPronunciation());
assertEquals("タイ", tokens.get(3).getPronunciation());
assertEquals("デス", tokens.get(4).getPronunciation());
assertEquals("", tokens.get(5).getPronunciation());
}
@Test
public void testPronunciations2() {
List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
assertEquals(9, tokens.size());
// pronunciation differs from reading here
assertEquals("オーク", tokens.get(0).getPronunciation());
assertEquals("", tokens.get(1).getPronunciation());
assertEquals("ガクセイ", tokens.get(2).getPronunciation());
assertEquals("", tokens.get(3).getPronunciation());
assertEquals("シケン", tokens.get(4).getPronunciation());
assertEquals("", tokens.get(5).getPronunciation());
assertEquals("オチ", tokens.get(6).getPronunciation());
assertEquals("", tokens.get(7).getPronunciation());
assertEquals("", tokens.get(8).getPronunciation());
}
@Test
public void testBasicForms() {
List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
assertEquals(9, tokens.size());
assertNull(tokens.get(0).getBaseForm());
assertNull(tokens.get(1).getBaseForm());
assertNull(tokens.get(2).getBaseForm());
assertNull(tokens.get(3).getBaseForm());
assertNull(tokens.get(4).getBaseForm());
assertNull(tokens.get(5).getBaseForm());
assertEquals(tokens.get(6).getBaseForm(), "ある");
assertNull(tokens.get(7).getBaseForm());
assertNull(tokens.get(8).getBaseForm());
}
@Test
public void testInflectionTypes() {
List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
assertEquals(9, tokens.size());
assertNull(tokens.get(0).getInflectionType());
assertNull(tokens.get(1).getInflectionType());
assertNull(tokens.get(2).getInflectionType());
assertNull(tokens.get(3).getInflectionType());
assertNull(tokens.get(4).getInflectionType());
assertNull(tokens.get(5).getInflectionType());
assertEquals("五段・ラ行", tokens.get(6).getInflectionType());
assertEquals("特殊・マス", tokens.get(7).getInflectionType());
assertNull(tokens.get(8).getInflectionType());
}
@Test
public void testInflectionForms() {
List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
assertEquals(9, tokens.size());
assertNull(tokens.get(0).getInflectionForm());
assertNull(tokens.get(1).getInflectionForm());
assertNull(tokens.get(2).getInflectionForm());
assertNull(tokens.get(3).getInflectionForm());
assertNull(tokens.get(4).getInflectionForm());
assertNull(tokens.get(5).getInflectionForm());
assertEquals("連用形", tokens.get(6).getInflectionForm());
assertEquals("基本形", tokens.get(7).getInflectionForm());
assertNull(tokens.get(8).getInflectionForm());
}
@Test
public void testPartOfSpeech() {
List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
assertEquals(9, tokens.size());
assertEquals("名詞-代名詞-一般", tokens.get(0).getPartOfSpeech());
assertEquals("助詞-係助詞", tokens.get(1).getPartOfSpeech());
assertEquals("副詞-助詞類接続", tokens.get(2).getPartOfSpeech());
assertEquals("名詞-サ変接続", tokens.get(3).getPartOfSpeech());
assertEquals("名詞-一般", tokens.get(4).getPartOfSpeech());
assertEquals("助詞-格助詞-一般", tokens.get(5).getPartOfSpeech());
assertEquals("動詞-自立", tokens.get(6).getPartOfSpeech());
assertEquals("助動詞", tokens.get(7).getPartOfSpeech());
assertEquals("記号-句点", tokens.get(8).getPartOfSpeech());
}
// TODO: the next 2 tests no longer use the first/last word ids; is it
// possible to look up the first and last words from the dictionary and fix them?
public void testYabottai() {
List<Token> tokens = segmenter.tokenize("やぼったい");
assertEquals(1, tokens.size());
assertEquals("やぼったい", tokens.get(0).getSurfaceFormString());
}
public void testTsukitosha() {
List<Token> tokens = segmenter.tokenize("突き通しゃ");
assertEquals(1, tokens.size());
assertEquals("突き通しゃ", tokens.get(0).getSurfaceFormString());
}
public void testBocchan() throws Exception {
doTestBocchan(1);
}
@Test @Nightly
public void testBocchanBig() throws Exception {
doTestBocchan(100);
}
private void doTestBocchan(int numIterations) throws Exception {
LineNumberReader reader = new LineNumberReader(new InputStreamReader(
this.getClass().getResourceAsStream("bocchan.utf-8")));
String line = reader.readLine();
reader.close();
if (VERBOSE) {
System.out.println("Test for Bocchan without pre-splitting sentences");
}
long totalStart = System.currentTimeMillis();
for (int i = 0; i < numIterations; i++){
segmenter.tokenize(line);
}
if (VERBOSE) {
System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
System.out.println("Test for Bocchan with pre-splitting sentences");
}
String[] sentences = line.split("、|。");
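// Pre-splitting on 、 and 。 hands the segmenter much shorter inputs, so each
// Viterbi lattice stays small; the two timings printed in VERBOSE mode show
// the cost difference against tokenizing the whole text in one call.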
totalStart = System.currentTimeMillis();
for (int i = 0; i < numIterations; i++) {
for (String sentence: sentences) {
segmenter.tokenize(sentence);
}
}
if (VERBOSE) {
System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
}
}
}

View File

@@ -0,0 +1,63 @@
package org.apache.lucene.analysis.kuromoji;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestExtendedMode extends BaseTokenStreamTestCase {
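// EXTENDED mode additionally breaks unknown words into unigrams (one token
// per character), which is why each rare ideograph in the surrogate test
// below is expected to come back as its own token.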
private final Segmenter segmenter = new Segmenter(Mode.EXTENDED);
private final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
/** simple test for supplementary characters */
public void testSurrogates() throws IOException {
assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
new String[] { "𩬅", "", "", "", "", "" });
}
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
int numIterations = atLeast(10000);
for (int i = 0; i < numIterations; i++) {
String s = _TestUtil.randomUnicodeString(random, 100);
TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
assertTrue(UnicodeUtil.validUTF16String(termAtt));
}
}
}
}

View File

@@ -0,0 +1,51 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with an NPE when the
* stopwords file is missing from the classpath */
public void testResourcesAvailable() {
new KuromojiAnalyzer(TEST_VERSION_CURRENT);
}
/**
* An example sentence, test removal of particles, etc by POS,
* lemmatization with the basic form, and that position increments
* and offsets are correct.
*/
public void testBasics() throws IOException {
assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "多くの学生が試験に落ちた。",
new String[] { "多く", "学生", "試験", "落ちる" },
new int[] { 0, 3, 6, 9 },
new int[] { 2, 5, 8, 11 },
new int[] { 1, 2, 2, 2 }
);
}
/**
* blast random strings against the analyzer
*/
public void testRandom() throws IOException {
checkRandomData(random, new KuromojiAnalyzer(TEST_VERSION_CURRENT), atLeast(10000));
}
}

View File

@@ -0,0 +1,50 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
public class TestKuromojiBaseFormFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(reader);
return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
}
};
public void testBasics() throws IOException {
assertAnalyzesTo(analyzer, "それはまだ実験段階にあります",
new String[] { "それ", "", "まだ", "実験", "段階", "", "ある", "ます" }
);
}
public void testEnglish() throws IOException {
assertAnalyzesTo(analyzer, "this atest",
new String[] { "this", "atest" });
}
public void testRandomStrings() throws IOException {
checkRandomData(random, analyzer, atLeast(10000));
}
}

View File

@@ -0,0 +1,150 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(reader);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
public void testDecomposition1() throws Exception {
assertAnalyzesTo(analyzer, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
"アメリカ低所得者医療援助制度が、今日では、その予算の約3分の1を老人に費やしている。",
new String[] { "本来", "", "貧困", "", "", "女性", "", "子供", "", "医療", "保護", "",
"提供", "する", "ため", "", "創設", "", "", "", "制度", "", "ある", "アメリカ",
"", "所得", "", "医療", "援助", "制度", "", "今日", "", "", "その",
"予算", "", "", "", "分の", "", "", "老人", "", "費やし", "", "いる" },
new int[] { 0, 2, 4, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30,
31, 33, 34, 37, 41, 42, 44, 45, 47, 49, 51, 53, 55, 56, 58, 60,
62, 63, 64, 65, 67, 68, 69, 71, 72, 75, 76 },
new int[] { 2, 3, 6, 7, 8, 10, 11, 13, 14, 16, 18, 19, 21, 23, 25, 26, 28, 29, 30, 31,
33, 34, 36, 41, 42, 44, 45, 47, 49, 51, 52, 55, 56, 57, 60, 62,
63, 64, 65, 67, 68, 69, 71, 72, 75, 76, 78 }
);
}
public void testDecomposition2() throws Exception {
assertAnalyzesTo(analyzer, "麻薬の密売は根こそぎ絶やさなければならない",
new String[] { "麻薬", "", "密売", "", "根こそぎ", "絶やさ", "なけれ", "", "なら", "ない" },
new int[] { 0, 2, 3, 5, 6, 10, 13, 16, 17, 19 },
new int[] { 2, 3, 5, 6, 10, 13, 16, 17, 19, 21 }
);
}
public void testDecomposition3() throws Exception {
assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。",
new String[] { "魔女", "", "大将", "マシュー", "ホプキンス" },
new int[] { 0, 2, 3, 5, 10 },
new int[] { 2, 3, 5, 9, 15 }
);
}
public void testDecomposition4() throws Exception {
assertAnalyzesTo(analyzer, "これは本ではない",
new String[] { "これ", "", "", "", "", "ない" },
new int[] { 0, 2, 3, 4, 5, 6 },
new int[] { 2, 3, 4, 5, 6, 8 }
);
}
public void testDecomposition5() throws Exception {
assertAnalyzesTo(analyzer, "くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ",
new String[] { "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ" },
new int[] { 0, 4, 8, 12, 16, 20, 24, 28, 32, 36},
new int[] { 4, 8, 12, 16, 20, 24, 28, 32, 36, 40 }
);
}
/** Tests that sentence offset is incorporated into the resulting offsets */
public void testTwoSentences() throws Exception {
assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
new String[] { "魔女", "", "大将", "マシュー", "ホプキンス", "魔女", "", "大将", "マシュー", "ホプキンス" },
new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
);
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}
public void testLargeDocReliability() throws Exception {
for (int i = 0; i < 100; i++) {
String s = _TestUtil.randomUnicodeString(random, 10000);
TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
ts.reset();
while (ts.incrementToken()) {
}
}
}
/** simple test for supplementary characters */
public void testSurrogates() throws IOException {
assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
new String[] { "𩬅", "", "", "", "", "" });
}
/** random test ensuring we don't ever split supplementaries */
public void testSurrogates2() throws IOException {
int numIterations = atLeast(10000);
for (int i = 0; i < numIterations; i++) {
String s = _TestUtil.randomUnicodeString(random, 100);
TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
assertTrue(UnicodeUtil.validUTF16String(termAtt));
}
}
}
// note: test is kinda silly since kuromoji emits punctuation tokens.
// but, when/if we filter these out it will be useful.
public void testEnd() throws Exception {
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない")),
new String[] { "これ", "", "", "", "", "ない" },
new int[] { 0, 2, 3, 4, 5, 6 },
new int[] { 2, 3, 4, 5, 6, 8 },
new Integer(8)
);
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない ")),
new String[] { "これ", "", "", "", "", "ない" },
new int[] { 0, 2, 3, 4, 5, 6, 8 },
new int[] { 2, 3, 4, 5, 6, 8, 9 },
new Integer(12)
);
}
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,107 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput;
public class TestTokenInfoDictionary extends LuceneTestCase {
/** enumerates the entire FST/lookup data and just does basic sanity checks */
public void testEnumerateAll() throws Exception {
// just for debugging
int numTerms = 0;
int numWords = 0;
int lastWordId = -1;
int lastSourceId = -1;
TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
ConnectionCosts matrix = ConnectionCosts.getInstance();
FST<Long> fst = tid.getFST().getInternalFST();
IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
InputOutput<Long> mapping;
IntsRef scratch = new IntsRef();
while ((mapping = fstEnum.next()) != null) {
numTerms++;
IntsRef input = mapping.input;
char chars[] = new char[input.length];
for (int i = 0; i < chars.length; i++) {
chars[i] = (char)input.ints[input.offset+i];
}
assertTrue(UnicodeUtil.validUTF16String(new String(chars)));
Long output = mapping.output;
int sourceId = output.intValue();
// we walk in order; terms, sourceIds, and wordIds should always be increasing
assertTrue(sourceId > lastSourceId);
lastSourceId = sourceId;
tid.lookupWordIds(sourceId, scratch);
for (int i = 0; i < scratch.length; i++) {
numWords++;
int wordId = scratch.ints[scratch.offset+i];
assertTrue(wordId > lastWordId);
lastWordId = wordId;
String baseForm = tid.getBaseForm(wordId);
assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
String inflectionForm = tid.getInflectionForm(wordId);
assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
if (inflectionForm != null) {
// check that it's actually an ipadic inflection form
assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
}
String inflectionType = tid.getInflectionType(wordId);
assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
if (inflectionType != null) {
// check that it's actually an ipadic inflection type
assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
}
int leftId = tid.getLeftId(wordId);
int rightId = tid.getRightId(wordId);
matrix.get(rightId, leftId);
tid.getWordCost(wordId);
String pos = tid.getPartOfSpeech(wordId);
assertNotNull(pos);
assertTrue(UnicodeUtil.validUTF16String(pos));
// check that it's actually an ipadic pos tag
assertNotNull(ToStringUtil.getPOSTranslation(pos));
String pronunciation = tid.getPronunciation(wordId);
assertNotNull(pronunciation);
assertTrue(UnicodeUtil.validUTF16String(pronunciation));
String reading = tid.getReading(wordId);
assertNotNull(reading);
assertTrue(UnicodeUtil.validUTF16String(reading));
}
}
if (VERBOSE) {
System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
}
}
}

View File

@@ -0,0 +1,98 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.IOException;
import org.apache.lucene.analysis.kuromoji.SegmenterTest;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class UserDictionaryTest extends LuceneTestCase {
private UserDictionary readDict() throws IOException {
InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
if (is == null)
throw new FileNotFoundException("Cannot find userdict.txt in test classpath!");
try {
Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
return new UserDictionary(reader);
} finally {
is.close();
}
}
@Test
public void testLookup() throws IOException {
UserDictionary dictionary = readDict();
String s = "関西国際空港に行った";
int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
// Length should be three: 関西, 国際, 空港
assertEquals(3, dictionaryEntryResult.length);
// Test positions
assertEquals(0, dictionaryEntryResult[0][1]); // index of 関西
assertEquals(2, dictionaryEntryResult[1][1]); // index of 国際
assertEquals(4, dictionaryEntryResult[2][1]); // index of 空港
// Test lengths
assertEquals(2, dictionaryEntryResult[0][2]); // length of 関西
assertEquals(2, dictionaryEntryResult[1][2]); // length of 国際
assertEquals(2, dictionaryEntryResult[2][2]); // length of 空港
s = "関西国際空港と関西国際空港に行った";
int[][] dictionaryEntryResult2 = dictionary.lookup(s.toCharArray(), 0, s.length());
// Length should be six
assertEquals(6, dictionaryEntryResult2.length);
}
@Test
public void testReadings() throws IOException {
UserDictionary dictionary = readDict();
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
assertEquals("ニホン", dictionary.getReading(wordIdNihon));
result = dictionary.lookup("朝青龍".toCharArray(), 0, 3);
assertEquals(1, result.length);
int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍
assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
}
@Test
public void testPartOfSpeech() throws IOException {
UserDictionary dictionary = readDict();
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
assertEquals("カスタム名詞", dictionary.getPartOfSpeech(wordIdKeizai));
}
@Test
public void testRead() throws IOException {
UserDictionary dictionary = readDict();
assertNotNull(dictionary);
}
}

View File

@@ -0,0 +1,6 @@
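# Format (CSV): <surface>,<space-separated segmentation>,<space-separated readings>,<part-of-speech>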
# Custom segmentation for long entries
日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞
# Custom reading for sumo wrestler
朝青龍,朝青龍,アサショウリュウ,カスタム人名

View File

@@ -0,0 +1,34 @@
package org.apache.lucene.analysis.kuromoji.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
public class TestToStringUtil extends LuceneTestCase {
public void testPOS() {
assertEquals("noun-suffix-verbal", ToStringUtil.getPOSTranslation("名詞-接尾-サ変接続"));
}
public void testHepburn() {
assertEquals("majan", ToStringUtil.getRomanization("マージャン"));
assertEquals("uroncha", ToStringUtil.getRomanization("ウーロンチャ"));
assertEquals("chahan", ToStringUtil.getRomanization("チャーハン"));
assertEquals("chashu", ToStringUtil.getRomanization("チャーシュー"));
assertEquals("shumai", ToStringUtil.getRomanization("シューマイ"));
}
}

View File

@@ -0,0 +1,371 @@
package org.apache.lucene.analysis.kuromoji.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.analysis.kuromoji.dict.BinaryDictionary;
public abstract class BinaryDictionaryWriter {
protected final Class<? extends BinaryDictionary> implClazz;
protected ByteBuffer buffer;
private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
private int[] targetMap = new int[8192];
private int[] targetMapOffsets = new int[8192];
private final List<String> posDict = new ArrayList<String>();
private final Map<String,Integer> posDictLookup = new HashMap<String,Integer>();
private final List<String> inflDict = new ArrayList<String>();
private final Map<String,Integer> inflDictLookup = new HashMap<String,Integer>();
public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
this.implClazz = implClazz;
buffer = ByteBuffer.allocate(size);
}
/**
* Put the entry into the dictionary buffer.
* @return current position of the buffer, which will be the wordId of the next entry
*/
public int put(String[] entry) {
short leftId = Short.parseShort(entry[1]);
short rightId = Short.parseShort(entry[2]);
short wordCost = Short.parseShort(entry[3]);
StringBuilder sb = new StringBuilder();
// build up the POS string
for (int i = 4; i < 8; i++) {
String part = entry[i];
assert part.length() > 0;
if (!"*".equals(part)) {
if (sb.length() > 0) {
sb.append('-');
}
sb.append(part);
}
}
String pos = sb.toString();
Integer posIndex = posDictLookup.get(pos);
if (posIndex == null) {
posIndex = posDict.size();
posDict.add(pos);
posDictLookup.put(pos, posIndex);
assert posDict.size() == posDictLookup.size();
}
sb.setLength(0);
sb.append(CSVUtil.quoteEscape(entry[8]));
sb.append(',');
sb.append(CSVUtil.quoteEscape(entry[9]));
String inflData = sb.toString();
Integer inflIndex = Integer.MAX_VALUE;
int hasInflData;
if ("*,*".equals(inflData)) {
hasInflData = 0; // no inflection data
} else {
hasInflData = 1;
inflIndex = inflDictLookup.get(inflData);
if (inflIndex == null) {
inflIndex = inflDict.size();
inflDict.add(inflData);
inflDictLookup.put(inflData, inflIndex);
assert inflDict.size() == inflDictLookup.size();
}
}
String baseForm = entry[10];
String reading = entry[11];
String pronunciation = entry[12];
// extend buffer if necessary
int left = buffer.remaining();
// worst case: three shorts, four flag/length bytes, a two-byte inflection index, and the features (all as UTF-16)
int worstCase = 6 + 4 + 2 + 2*(baseForm.length() + reading.length() + pronunciation.length());
if (worstCase > left) {
ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1));
buffer.flip();
newBuffer.put(buffer);
buffer = newBuffer;
}
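// Entry layout written below: leftId(short), rightId(short), wordCost(short),
// then a byte packing posIndex<<1 | hasInflData, a base-form byte packing
// length<<1 | pronunciationIsReading (length 0 when the base form equals the
// surface form), the reading as length<<1 | isKatakana plus its chars (one
// byte per char for katakana), the pronunciation in the same shape when it
// differs from the reading, and finally a 1-2 byte inflection index when
// hasInflData is set.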
buffer.putShort(leftId);
buffer.putShort(rightId);
buffer.putShort(wordCost);
assert posIndex.intValue() < 128;
buffer.put((byte) (posIndex.intValue() << 1 | hasInflData));
int pronunciationIsReading = pronunciation.equals(reading) ? 1 : 0;
if ("*".equals(baseForm) || baseForm.equals(entry[0])) {
buffer.put((byte)pronunciationIsReading); // base form is the same as surface form
} else {
assert baseForm.length() < 128;
buffer.put((byte)(baseForm.length() << 1 | pronunciationIsReading));
for (int i = 0; i < baseForm.length(); i++) {
buffer.putChar(baseForm.charAt(i));
}
}
if (isKatakana(reading)) {
buffer.put((byte) (reading.length() << 1 | 1));
writeKatakana(reading);
} else {
buffer.put((byte) (reading.length() << 1));
for (int i = 0; i < reading.length(); i++) {
buffer.putChar(reading.charAt(i));
}
}
if (pronunciationIsReading == 0) {
if (isKatakana(pronunciation)) {
buffer.put((byte) (pronunciation.length() << 1 | 1));
writeKatakana(pronunciation);
} else {
buffer.put((byte) (pronunciation.length() << 1));
for (int i = 0; i < pronunciation.length(); i++) {
buffer.putChar(pronunciation.charAt(i));
}
}
}
if (hasInflData > 0) {
int key = inflIndex.intValue();
assert key < 32768; // note there are really like 300 of these...
if (key < 128) {
buffer.put((byte) key);
} else {
buffer.put((byte) ((key & 0x7f) | 0x80));
buffer.put((byte) (key >>> 7));
}
}
return buffer.position();
}
private boolean isKatakana(String s) {
for (int i = 0; i < s.length(); i++) {
char ch = s.charAt(i);
if (ch < 0x30A0 || ch > 0x30FF) {
return false;
}
}
return true;
}
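// Katakana (U+30A0..U+30FF) fits in one byte once the block base is
// subtracted, so katakana readings and pronunciations are stored at one byte
// per char; the low bit of the length byte written in put() records which
// encoding was used.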
private void writeKatakana(String s) {
for (int i = 0; i < s.length(); i++) {
buffer.put((byte) (s.charAt(i) - 0x30A0));
}
}
public void addMapping(int sourceId, int wordId) {
assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;
if (sourceId > lastSourceId) {
assert sourceId > lastSourceId : "source ids out of order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId;
targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1);
for (int i = lastSourceId + 1; i <= sourceId; i++) {
targetMapOffsets[i] = targetMapEndOffset;
}
} else {
assert sourceId == lastSourceId;
}
targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1);
targetMap[targetMapEndOffset] = wordId;
targetMapEndOffset++;
lastSourceId = sourceId;
lastWordId = wordId;
}
protected final String getBaseFileName(String baseDir) throws IOException {
return baseDir + File.separator + implClazz.getName().replace('.', File.separatorChar);
}
/**
* Write the dictionary files under baseDir.
* The main dictionary file is a codec header, the size of the entry buffer (vint),
* and then the raw entries as encoded by {@link #put}; the target map, POS
* dictionary and inflection dictionary are written alongside it.
* @throws IOException
*/
public void write(String baseDir) throws IOException {
final String baseName = getBaseFileName(baseDir);
writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
writeInflDict(baseName + BinaryDictionary.INFLDICT_FILENAME_SUFFIX);
}
// TODO: maybe this int[] should instead be the output to the FST...
protected void writeTargetMap(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
OutputStream os = new FileOutputStream(filename);
try {
os = new BufferedOutputStream(os);
final DataOutput out = new OutputStreamDataOutput(os);
CodecUtil.writeHeader(out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
final int numSourceIds = lastSourceId + 1;
out.writeVInt(targetMapEndOffset); // <-- size of main array
out.writeVInt(numSourceIds + 1); // <-- size of offset array (+ 1 more entry)
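// Encoding for the loop below: each targetMap value is delta-coded against
// the previous one and shifted left by one; the low bit marks offsets where
// a new sourceId's run of wordIds begins.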
int prev = 0, sourceId = 0;
for (int ofs = 0; ofs < targetMapEndOffset; ofs++) {
final int val = targetMap[ofs], delta = val - prev;
assert delta >= 0;
if (ofs == targetMapOffsets[sourceId]) {
out.writeVInt((delta << 1) | 0x01);
sourceId++;
} else {
out.writeVInt((delta << 1));
}
prev += delta;
}
assert sourceId == numSourceIds : "sourceId:"+sourceId+" != numSourceIds:"+numSourceIds;
} finally {
os.close();
}
}
protected void writePosDict(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
OutputStream os = new FileOutputStream(filename);
try {
os = new BufferedOutputStream(os);
final DataOutput out = new OutputStreamDataOutput(os);
CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
out.writeVInt(posDict.size());
for (String s : posDict) {
out.writeString(s);
}
} finally {
os.close();
}
}
protected void writeInflDict(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
OutputStream os = new FileOutputStream(filename);
try {
os = new BufferedOutputStream(os);
final DataOutput out = new OutputStreamDataOutput(os);
CodecUtil.writeHeader(out, BinaryDictionary.INFLDICT_HEADER, BinaryDictionary.VERSION);
out.writeVInt(inflDict.size());
for (String s : inflDict) {
String data[] = CSVUtil.parse(s);
assert data.length == 2 : "malformed inflection: " + s;
out.writeString(data[0]);
out.writeString(data[1]);
}
} finally {
os.close();
}
}
protected void writeDictionary(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
final FileOutputStream os = new FileOutputStream(filename);
try {
final DataOutput out = new OutputStreamDataOutput(os);
CodecUtil.writeHeader(out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
out.writeVInt(buffer.position());
final WritableByteChannel channel = Channels.newChannel(os);
// Write Buffer
buffer.flip(); // set position to 0, set limit to current position
channel.write(buffer);
assert buffer.remaining() == 0L;
} finally {
os.close();
}
}
// TODO: the below is messy, but makes the dictionary smaller.
// we track frequencies of inflections so the highest-freq ones have smaller indexes.
/** optional: notes inflection seen in the data up front */
public void noteInflection(String entry[]) {
StringBuilder sb = new StringBuilder();
sb.append(CSVUtil.quoteEscape(entry[8]));
sb.append(',');
sb.append(CSVUtil.quoteEscape(entry[9]));
String s = sb.toString();
if ("*,*".equals(s)) {
return; // no inflection data
}
Integer freq = notedInflections.get(s);
if (freq == null) {
freq = 0;
}
notedInflections.put(s, freq+1);
}
/** prepopulates inflection mapping by frequency */
public void finalizeInflections() {
InflectionAndFreq freqs[] = new InflectionAndFreq[notedInflections.size()];
int upto = 0;
for (Map.Entry<String,Integer> e : notedInflections.entrySet()) {
freqs[upto++] = new InflectionAndFreq(e.getKey(), e.getValue());
}
Arrays.sort(freqs, Collections.reverseOrder());
for (int i = 0; i < upto; i++) {
inflDict.add(freqs[i].inflection);
inflDictLookup.put(freqs[i].inflection, i);
}
}
static class InflectionAndFreq implements Comparable<InflectionAndFreq> {
String inflection;
int freq;
InflectionAndFreq(String s, int i) {
this.inflection = s;
this.freq = i;
}
public int compareTo(InflectionAndFreq other) {
int cmp = freq - other.freq;
if (cmp == 0) {
return inflection.compareTo(other.inflection);
} else {
return cmp;
}
}
}
private HashMap<String,Integer> notedInflections = new HashMap<String,Integer>();
}

View File

@@ -0,0 +1,95 @@
package org.apache.lucene.analysis.kuromoji.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Arrays;
import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.CodecUtil;
public final class CharacterDefinitionWriter {
private final byte[] characterCategoryMap = new byte[0x10000];
private final boolean[] invokeMap = new boolean[CharacterDefinition.CLASS_COUNT];
private final boolean[] groupMap = new boolean[CharacterDefinition.CLASS_COUNT];
/**
* Constructor for building. TODO: remove write access
*/
public CharacterDefinitionWriter() {
Arrays.fill(characterCategoryMap, CharacterDefinition.DEFAULT);
}
/**
* Put mapping from unicode code point to character class.
*
* @param codePoint
* code point
* @param characterClassName character class name
*/
public void putCharacterCategory(int codePoint, String characterClassName) {
    characterClassName = characterClassName.split(" ")[0]; // use the first category class
// Override Nakaguro
if (codePoint == 0x30FB) {
characterClassName = "SYMBOL";
}
characterCategoryMap[codePoint] = CharacterDefinition.lookupCharacterClass(characterClassName);
}
public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
final byte characterClass = CharacterDefinition.lookupCharacterClass(characterClassName);
invokeMap[characterClass] = invoke == 1;
groupMap[characterClass] = group == 1;
// TODO: length def ignored
}
public void write(String baseDir) throws IOException {
String filename = baseDir + File.separator +
CharacterDefinition.class.getName().replace('.', File.separatorChar) + CharacterDefinition.FILENAME_SUFFIX;
new File(filename).getParentFile().mkdirs();
OutputStream os = new FileOutputStream(filename);
try {
os = new BufferedOutputStream(os);
final DataOutput out = new OutputStreamDataOutput(os);
CodecUtil.writeHeader(out, CharacterDefinition.HEADER, CharacterDefinition.VERSION);
out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length);
for (int i = 0; i < CharacterDefinition.CLASS_COUNT; i++) {
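      // pack each class's invoke flag into bit 0 and its group flag into bit 1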
final byte b = (byte) (
(invokeMap[i] ? 0x01 : 0x00) |
(groupMap[i] ? 0x02 : 0x00)
);
out.writeByte(b);
}
} finally {
os.close();
}
}
}

View File

@ -0,0 +1,67 @@
package org.apache.lucene.analysis.kuromoji.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
public class ConnectionCostsBuilder {
private ConnectionCostsBuilder() {
}
public static ConnectionCostsWriter build(String filename) throws IOException {
FileInputStream inputStream = new FileInputStream(filename);
Charset cs = Charset.forName("US-ASCII");
CharsetDecoder decoder = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
LineNumberReader lineReader = new LineNumberReader(streamReader);
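    // the first line of matrix.def holds the matrix dimensions: "<forwardSize> <backwardSize>"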
String line = lineReader.readLine();
String[] dimensions = line.split("\\s+");
assert dimensions.length == 2;
int forwardSize = Integer.parseInt(dimensions[0]);
int backwardSize = Integer.parseInt(dimensions[1]);
assert forwardSize > 0 && backwardSize > 0;
ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize);
while ((line = lineReader.readLine()) != null) {
String[] fields = line.split("\\s+");
assert fields.length == 3;
int forwardId = Integer.parseInt(fields[0]);
int backwardId = Integer.parseInt(fields[1]);
int cost = Integer.parseInt(fields[2]);
costs.add(forwardId, backwardId, cost);
}
return costs;
}
}

View File

@ -0,0 +1,76 @@
package org.apache.lucene.analysis.kuromoji.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.CodecUtil;
public final class ConnectionCostsWriter {
  private final short[][] costs; // indexed by backward ID first, since get() is called with the same backward ID consecutively; this may not matter.
private final int forwardSize;
private final int backwardSize;
/**
* Constructor for building. TODO: remove write access
*/
public ConnectionCostsWriter(int forwardSize, int backwardSize) {
this.forwardSize = forwardSize;
this.backwardSize = backwardSize;
this.costs = new short[backwardSize][forwardSize];
}
public void add(int forwardId, int backwardId, int cost) {
this.costs[backwardId][forwardId] = (short)cost;
}
public void write(String baseDir) throws IOException {
String filename = baseDir + File.separator +
ConnectionCosts.class.getName().replace('.', File.separatorChar) + ConnectionCosts.FILENAME_SUFFIX;
new File(filename).getParentFile().mkdirs();
OutputStream os = new FileOutputStream(filename);
try {
os = new BufferedOutputStream(os);
final DataOutput out = new OutputStreamDataOutput(os);
CodecUtil.writeHeader(out, ConnectionCosts.HEADER, ConnectionCosts.VERSION);
out.writeVInt(forwardSize);
out.writeVInt(backwardSize);
int last = 0;
assert costs.length == backwardSize;
for (short[] a : costs) {
assert a.length == forwardSize;
for (int i = 0; i < a.length; i++) {
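        // delta + zigzag encode each cost: deltas between neighboring costs are small,
        // and folding the sign into the low bit keeps negative deltas short as vints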
int delta = (int)a[i] - last;
out.writeVInt((delta >> 31) ^ (delta << 1));
last = a[i];
}
}
} finally {
os.close();
}
}
}

View File

@ -0,0 +1,85 @@
package org.apache.lucene.analysis.kuromoji.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
public class DictionaryBuilder {
public enum DictionaryFormat { IPADIC, UNIDIC };
private DictionaryBuilder() {
}
public static void build(DictionaryFormat format,
String inputDirname,
String outputDirname,
String encoding,
boolean normalizeEntry) throws IOException {
System.out.println("building tokeninfo dict...");
TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.build(inputDirname);
tokenInfoDictionary.write(outputDirname);
tokenInfoDictionary = null;
tokenInfoBuilder = null;
System.out.println("done");
System.out.print("building unknown word dict...");
UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
UnknownDictionaryWriter unkDictionary = unkBuilder.build(inputDirname);
unkDictionary.write(outputDirname);
unkDictionary = null;
unkBuilder = null;
System.out.println("done");
System.out.print("building connection costs...");
ConnectionCostsWriter connectionCosts
= ConnectionCostsBuilder.build(inputDirname + File.separator + "matrix.def");
connectionCosts.write(outputDirname);
System.out.println("done");
}
public static void main(String[] args) throws IOException, ClassNotFoundException {
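    // usage: DictionaryBuilder <format: ipadic|unidic> <input dir> <output dir> <input encoding> <normalize entries: true|false>
    // for example (hypothetical paths): ipadic mecab-ipadic-2.7.0-20070801 build/dict euc-jp false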
DictionaryFormat format;
if (args[0].equalsIgnoreCase("ipadic")) {
format = DictionaryFormat.IPADIC;
} else if (args[0].equalsIgnoreCase("unidic")) {
format = DictionaryFormat.UNIDIC;
} else {
System.err.println("Illegal format " + args[0] + " using unidic instead");
format = DictionaryFormat.IPADIC;
}
String inputDirname = args[1];
String outputDirname = args[2];
String inputEncoding = args[3];
boolean normalizeEntries = Boolean.parseBoolean(args[4]);
System.out.println("dictionary builder");
System.out.println("");
System.out.println("dictionary format: " + format);
System.out.println("input directory: " + inputDirname);
System.out.println("output directory: " + outputDirname);
System.out.println("input encoding: " + inputEncoding);
System.out.println("normalize entries: " + normalizeEntries);
System.out.println("");
DictionaryBuilder.build(format, inputDirname, outputDirname, inputEncoding, normalizeEntries);
}
}

View File

@ -0,0 +1,230 @@
package org.apache.lucene.analysis.kuromoji.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.kuromoji.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import com.ibm.icu.text.Normalizer2;
/**
 * Builds the token info dictionary from the dictionary CSV source files.
 */
public class TokenInfoDictionaryBuilder {
/** Internal word id, incrementally assigned as entries are read and added; this is the byte offset into the dictionary file. */
private int offset = 0;
private String encoding = "euc-jp";
private boolean normalizeEntries = false;
private Normalizer2 normalizer;
private DictionaryFormat format = DictionaryFormat.IPADIC;
public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
this.format = format;
this.encoding = encoding;
this.normalizeEntries = normalizeEntries;
this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
}
public TokenInfoDictionaryWriter build(String dirname) throws IOException {
FilenameFilter filter = new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.endsWith(".csv");
}
};
ArrayList<File> csvFiles = new ArrayList<File>();
for (File file : new File(dirname).listFiles(filter)) {
csvFiles.add(file);
}
Collections.sort(csvFiles);
return buildDictionary(csvFiles);
}
public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
// all entries across all CSV files
System.out.println(" parse...");
List<String[]> lines = new ArrayList<String[]>(400000);
for (File file : csvFiles){
FileInputStream inputStream = new FileInputStream(file);
Charset cs = Charset.forName(encoding);
CharsetDecoder decoder = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
BufferedReader reader = new BufferedReader(streamReader);
String line = null;
while ((line = reader.readLine()) != null) {
String[] entry = CSVUtil.parse(line);
if(entry.length < 13) {
System.out.println("Entry in CSV is not valid: " + line);
continue;
}
String[] formatted = formatEntry(entry);
dictionary.noteInflection(formatted);
lines.add(formatted);
// NFKC normalize dictionary entry
if (normalizeEntries) {
if (normalizer.isNormalized(entry[0])){
continue;
}
String[] normalizedEntry = new String[entry.length];
for (int i = 0; i < entry.length; i++) {
normalizedEntry[i] = normalizer.normalize(entry[i]);
}
formatted = formatEntry(normalizedEntry);
dictionary.noteInflection(formatted);
lines.add(formatted);
}
}
}
dictionary.finalizeInflections();
System.out.println(" sort...");
// sort by term: we sorted the files already and use a stable sort.
Collections.sort(lines, new Comparator<String[]>() {
public int compare(String[] left, String[] right) {
return left[0].compareTo(right[0]);
}
});
System.out.println(" encode...");
PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true);
Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, fstOutput);
IntsRef scratch = new IntsRef();
long ord = -1; // first ord will be 0
String lastValue = null;
// build tokeninfo dictionary
for (String[] entry : lines) {
int next = dictionary.put(entry);
if(next == offset){
System.out.println("Failed to process line: " + Arrays.toString(entry));
continue;
}
String token = entry[0];
if (!token.equals(lastValue)) {
// new word to add to fst
ord++;
lastValue = token;
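        // copy the token's UTF-16 code units into the IntsRef (the FST uses BYTE2 inputs)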
scratch.grow(token.length());
scratch.length = token.length();
for (int i = 0; i < token.length(); i++) {
scratch.ints[i] = (int) token.charAt(i);
}
fstBuilder.add(scratch, fstOutput.get(ord));
}
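      // homonyms: entries sharing a surface form map the same FST ord to multiple byte offsets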
dictionary.addMapping((int)ord, offset);
offset = next;
}
FST<Long> fst = fstBuilder.finish();
System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... ");
dictionary.setFST(fst);
System.out.println(" done");
return dictionary;
}
/*
* IPADIC features
*
* 0 - surface
* 1 - left cost
* 2 - right cost
* 3 - word cost
* 4-9 - pos
* 10 - base form
* 11 - reading
 * 12 - pronunciation
*
* UniDic features
*
* 0 - surface
* 1 - left cost
* 2 - right cost
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
* 11 - base form
* 12 - surface form
* 13 - surface reading
*/
public String[] formatEntry(String[] features) {
if (this.format == DictionaryFormat.IPADIC) {
return features;
} else {
String[] features2 = new String[13];
features2[0] = features[0];
features2[1] = features[1];
features2[2] = features[2];
features2[3] = features[3];
features2[4] = features[4];
features2[5] = features[5];
features2[6] = features[6];
features2[7] = features[7];
features2[8] = features[8];
features2[9] = features[9];
features2[10] = features[11];
// If the surface reading is non-existent, use surface form for reading and pronunciation.
// This happens with punctuation in UniDic and there are possibly other cases as well
if (features[13].length() == 0) {
features2[11] = features[0];
features2[12] = features[0];
} else {
features2[11] = features[13];
features2[12] = features[13];
}
return features2;
}
}
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.kuromoji.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
import org.apache.lucene.util.fst.FST;
public class TokenInfoDictionaryWriter extends BinaryDictionaryWriter {
private FST<Long> fst;
public TokenInfoDictionaryWriter(int size) {
super(TokenInfoDictionary.class, size);
}
public void setFST(FST<Long> fst) {
this.fst = fst;
}
@Override
public void write(String baseDir) throws IOException {
super.write(baseDir);
writeFST(getBaseFileName(baseDir) + TokenInfoDictionary.FST_FILENAME_SUFFIX);
}
protected void writeFST(String filename) throws IOException {
File f = new File(filename);
f.getParentFile().mkdirs();
fst.save(f);
}
}

View File

@ -0,0 +1,137 @@
package org.apache.lucene.analysis.kuromoji.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
public class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*,*,*";
private String encoding = "euc-jp";
public UnknownDictionaryBuilder(String encoding) {
this.encoding = encoding;
}
public UnknownDictionaryWriter build(String dirname) throws IOException {
UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def"); // should be only one file
readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
return unkDictionary;
}
public UnknownDictionaryWriter readDictionaryFile(String filename)
throws IOException {
return readDictionaryFile(filename, encoding);
}
public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding)
throws IOException {
UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
FileInputStream inputStream = new FileInputStream(filename);
Charset cs = Charset.forName(encoding);
CharsetDecoder decoder = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
InputStreamReader streamReader = new InputStreamReader(inputStream, decoder);
LineNumberReader lineReader = new LineNumberReader(streamReader);
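    // seed the dictionary with the synthetic NGRAM entry (used when emitting character n-grams for unknown words)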
dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
List<String[]> lines = new ArrayList<String[]>();
String line = null;
while ((line = lineReader.readLine()) != null) {
      // note: unk.def has only 10 fields; appending empty reading and pronunciation fields simplifies the writer,
      // even though the unknown dictionary returns a hardcoded null for them.
final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
lines.add(parsed);
dictionary.noteInflection(parsed); // for completeness; I think unk.def has no inflections...
}
dictionary.finalizeInflections(); // should also be no-op
Collections.sort(lines, new Comparator<String[]>() {
public int compare(String[] left, String[] right) {
int leftId = CharacterDefinition.lookupCharacterClass(left[0]);
int rightId = CharacterDefinition.lookupCharacterClass(right[0]);
return leftId - rightId;
}
});
for (String[] entry : lines) {
dictionary.put(entry);
}
return dictionary;
}
public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException {
FileInputStream inputStream = new FileInputStream(filename);
InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
LineNumberReader lineReader = new LineNumberReader(streamReader);
String line = null;
while ((line = lineReader.readLine()) != null) {
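      // strip leading whitespace and trailing comments, and collapse whitespace runs to single spaces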
line = line.replaceAll("^\\s", "");
line = line.replaceAll("\\s*#.*", "");
line = line.replaceAll("\\s+", " ");
// Skip empty line or comment line
if(line.length() == 0) {
continue;
}
if(line.startsWith("0x")) { // Category mapping
String[] values = line.split(" ", 2); // Split only first space
if(!values[0].contains("..")) {
int cp = Integer.decode(values[0]).intValue();
dictionary.putCharacterCategory(cp, values[1]);
} else {
String[] codePoints = values[0].split("\\.\\.");
int cpFrom = Integer.decode(codePoints[0]).intValue();
int cpTo = Integer.decode(codePoints[1]).intValue();
for(int i = cpFrom; i <= cpTo; i++){
dictionary.putCharacterCategory(i, values[1]);
}
}
} else { // Invoke definition
String[] values = line.split(" "); // Consecutive space is merged above
String characterClassName = values[0];
int invoke = Integer.parseInt(values[1]);
int group = Integer.parseInt(values[2]);
int length = Integer.parseInt(values[3]);
dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
}
}
}
}

View File

@ -0,0 +1,50 @@
package org.apache.lucene.analysis.kuromoji.util;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
import org.apache.lucene.analysis.kuromoji.dict.BinaryDictionary;
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
public class UnknownDictionaryWriter extends BinaryDictionaryWriter {
private final CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter();
public UnknownDictionaryWriter(int size) {
super(UnknownDictionary.class, size);
}
@Override
public int put(String[] entry) {
// Get wordId of current entry
int wordId = buffer.position();
// Put entry
int result = super.put(entry);
// Put entry in targetMap
int characterId = CharacterDefinition.lookupCharacterClass(entry[0]);
addMapping(characterId, wordId);
return result;
}
/**
* Put mapping from unicode code point to character class.
*
* @param codePoint code point
* @param characterClassName character class name
*/
public void putCharacterCategory(int codePoint, String characterClassName) {
characterDefinition.putCharacterCategory(codePoint, characterClassName);
}
public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length);
}
@Override
public void write(String baseDir) throws IOException {
super.write(baseDir);
characterDefinition.write(baseDir);
}
}

View File

@ -0,0 +1,75 @@
package org.apache.lucene.analysis.kuromoji.dict;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
import org.apache.lucene.analysis.kuromoji.util.UnknownDictionaryWriter;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class UnknownDictionaryTest extends LuceneTestCase {
public static final String FILENAME = "unk-tokeninfo-dict.obj";
@Test
public void testPutCharacterCategory() {
UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
try{
unkDic.putCharacterCategory(0, "DUMMY_NAME");
fail();
} catch(Exception e) {
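        // expected: unknown character class name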
}
try{
unkDic.putCharacterCategory(-1, "KATAKANA");
fail();
} catch(Exception e) {
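        // expected: negative code point is out of range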
}
unkDic.putCharacterCategory(0, "DEFAULT");
unkDic.putCharacterCategory(1, "GREEK");
unkDic.putCharacterCategory(2, "HIRAGANA");
unkDic.putCharacterCategory(3, "KATAKANA");
unkDic.putCharacterCategory(4, "KANJI");
}
@Test
public void testPut() {
UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
try{
unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*"));
fail();
} catch(Exception e){
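      // expected: malformed entry (only 12 fields; a context id is missing)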
}
String entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
String entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
String entry3 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";
unkDic.putCharacterCategory(0, "ALPHA");
unkDic.putCharacterCategory(1, "HIRAGANA");
unkDic.putCharacterCategory(2, "KANJI");
unkDic.put(CSVUtil.parse(entry1));
unkDic.put(CSVUtil.parse(entry2));
unkDic.put(CSVUtil.parse(entry3));
}
}

View File

@ -17,6 +17,9 @@ $Id$
the Solr 3.x ICUCollationKeyFilterFactory, and also supports
Locale-sensitive range queries. (rmuir)
* LUCENE-3305: Added Kuromoji morphological analyzer for Japanese.
(Christian Moen, Masaru Hasegawa via Robert Muir)
================== 3.6.0 ==================
* SOLR-2919: Added parametric tailoring options to ICUCollationKeyFilterFactory.

View File

@ -2,15 +2,16 @@ The analysis-extras plugin provides additional analyzers that rely
upon large dependencies/dictionaries.
It includes integration with ICU for multilingual support, and
analyzers for Chinese and Polish.
analyzers for Chinese, Japanese, and Polish.
Relies upon the following lucene components (in lucene-libs/):
* lucene-analyzers-icu-X.Y.jar
* lucene-analyzers-kuromoji-X.Y.jar
* lucene-analyzers-smartcn-X.Y.jar
* lucene-analyzers-stempel-X.Y.jar
And the ICU library (in lib/):
* icu4j-X.Y.jar

View File

@ -27,21 +27,23 @@
<path id="classpath">
<pathelement path="${analyzers-icu.jar}"/>
<pathelement path="${analyzers-kuromoji.jar}"/>
<pathelement path="${analyzers-smartcn.jar}"/>
<pathelement path="${analyzers-stempel.jar}"/>
<path refid="solr.base.classpath"/>
</path>
<target name="module-jars-to-solr"
depends="jar-analyzers-icu, jar-analyzers-smartcn, jar-analyzers-stempel">
depends="jar-analyzers-icu, jar-analyzers-kuromoji, jar-analyzers-smartcn, jar-analyzers-stempel">
<mkdir dir="${build.dir}/lucene-libs"/>
<copy todir="${build.dir}/lucene-libs" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
<fileset file="${analyzers-icu.jar}"/>
<fileset file="${analyzers-kuromoji.jar}"/>
<fileset file="${analyzers-smartcn.jar}"/>
<fileset file="${analyzers-stempel.jar}"/>
</copy>
</target>
<target name="compile-core" depends="jar-analyzers-icu, jar-analyzers-smartcn, jar-analyzers-stempel, solr-contrib-build.compile-core"/>
<target name="compile-core" depends="jar-analyzers-icu, jar-analyzers-kuromoji, jar-analyzers-smartcn, jar-analyzers-stempel, solr-contrib-build.compile-core"/>
<target name="dist" depends="module-jars-to-solr, common-solr.dist"/>
</project>

View File

@ -0,0 +1,40 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.kuromoji.KuromojiBaseFormFilter;
/**
* Factory for {@link KuromojiBaseFormFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ja" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.KuromojiTokenizerFactory"/&gt;
* &lt;filter class="solr.KuromojiBaseFormFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
*/
public class KuromojiBaseFormFilterFactory extends BaseTokenFilterFactory {
@Override
public TokenStream create(TokenStream input) {
return new KuromojiBaseFormFilter(input);
}
}

View File

@ -0,0 +1,65 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.kuromoji.KuromojiPartOfSpeechStopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
* Factory for {@link KuromojiPartOfSpeechStopFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ja" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.KuromojiTokenizerFactory"/&gt;
* &lt;filter class="solr.KuromojiPartOfSpeechStopFilterFactory"
* tags="stopTags.txt"
* enablePositionIncrements="true"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
*/
public class KuromojiPartOfSpeechStopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
private boolean enablePositionIncrements;
private Set<String> stopTags;
public void inform(ResourceLoader loader) {
String stopTagFiles = args.get("tags");
enablePositionIncrements = getBoolean("enablePositionIncrements", false);
try {
CharArraySet cas = getWordSet(loader, stopTagFiles, false);
stopTags = new HashSet<String>();
for (Object element : cas) {
char chars[] = (char[]) element;
stopTags.add(new String(chars));
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public TokenStream create(TokenStream stream) {
return new KuromojiPartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags);
}
}

View File

@ -0,0 +1,92 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Locale;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.Segmenter;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.BaseTokenizerFactory;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.SolrException;
import org.apache.solr.util.plugin.ResourceLoaderAware;
/**
* Factory for {@link KuromojiTokenizer}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ja" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.KuromojiTokenizerFactory"
 *       mode="NORMAL"
 *       user-dictionary="user.txt"
 *       user-dictionary-encoding="UTF-8"
* /&gt;
* &lt;filter class="solr.KuromojiBaseFormFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;
* </pre>
*/
public class KuromojiTokenizerFactory extends BaseTokenizerFactory implements ResourceLoaderAware {
private static final String MODE = "mode";
private static final String USER_DICT_PATH = "user-dictionary";
private static final String USER_DICT_ENCODING = "user-dictionary-encoding";
private Segmenter segmenter;
@Override
public void inform(ResourceLoader loader) {
Mode mode = args.get(MODE) != null ? Mode.valueOf(args.get(MODE).toUpperCase(Locale.ENGLISH)) : Mode.NORMAL;
String userDictionaryPath = args.get(USER_DICT_PATH);
try {
if (userDictionaryPath != null) {
InputStream stream = loader.openResource(userDictionaryPath);
String encoding = args.get(USER_DICT_ENCODING);
if (encoding == null) {
encoding = IOUtils.UTF_8;
}
// note: we could allow for other encodings here as an argument
CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
Reader reader = new InputStreamReader(stream, decoder);
this.segmenter = new Segmenter(new UserDictionary(reader), mode);
} else {
this.segmenter = new Segmenter(mode);
}
} catch (Exception e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
}
@Override
public Tokenizer create(Reader input) {
return new KuromojiTokenizer(segmenter, input);
}
}

View File

@ -0,0 +1,46 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import org.apache.solr.common.ResourceLoader;
class StringMockSolrResourceLoader implements ResourceLoader {
String text;
StringMockSolrResourceLoader(String text) {
this.text = text;
}
public List<String> getLines(String resource) throws IOException {
return Arrays.asList(text.split("\n"));
}
public Object newInstance(String cname, String... subpackages) {
return null;
}
public InputStream openResource(String resource) throws IOException {
return new ByteArrayInputStream(text.getBytes("UTF-8"));
}
}

View File

@ -17,18 +17,14 @@ package org.apache.solr.analysis;
* limitations under the License.
*/
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.solr.common.ResourceLoader;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
@ -263,26 +259,6 @@ public class TestICUCollationKeyFilterFactory extends BaseTokenTestCase {
assertCollatesToSame(tsUmlaut, tsOE);
}
private class StringMockSolrResourceLoader implements ResourceLoader {
String text;
StringMockSolrResourceLoader(String text) {
this.text = text;
}
public List<String> getLines(String resource) throws IOException {
return null;
}
public Object newInstance(String cname, String... subpackages) {
return null;
}
public InputStream openResource(String resource) throws IOException {
return new ByteArrayInputStream(text.getBytes("UTF-8"));
}
}
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
assertCollation(stream1, stream2, 0);
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.core.SolrResourceLoader;
/**
* Simple tests for {@link KuromojiBaseFormFilterFactory}
*/
public class TestKuromojiBaseFormFilterFactory extends BaseTokenTestCase {
public void testBasics() throws IOException {
KuromojiTokenizerFactory tokenizerFactory = new KuromojiTokenizerFactory();
tokenizerFactory.init(DEFAULT_VERSION_PARAM);
tokenizerFactory.inform(new SolrResourceLoader(null, null));
TokenStream ts = tokenizerFactory.create(new StringReader("それはまだ実験段階にあります"));
KuromojiBaseFormFilterFactory factory = new KuromojiBaseFormFilterFactory();
ts = factory.create(ts);
    assertTokenStreamContents(ts,
        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" }
    );
}
}

View File

@ -0,0 +1,52 @@
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.core.SolrResourceLoader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Simple tests for {@link KuromojiPartOfSpeechStopFilter}
*/
public class TestKuromojiPartOfSpeechStopFilterFactory extends BaseTokenTestCase {
public void testBasics() throws IOException {
String tags =
"# verb-main:\n" +
"動詞-自立\n";
KuromojiTokenizerFactory tokenizerFactory = new KuromojiTokenizerFactory();
tokenizerFactory.init(DEFAULT_VERSION_PARAM);
tokenizerFactory.inform(new SolrResourceLoader(null, null));
TokenStream ts = tokenizerFactory.create(new StringReader("私は制限スピードを超える。"));
KuromojiPartOfSpeechStopFilterFactory factory = new KuromojiPartOfSpeechStopFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
args.put("tags", "stoptags.txt");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(tags));
ts = factory.create(ts);
    assertTokenStreamContents(ts,
        new String[] { "私", "は", "制限", "スピード", "を" }
    );
}
}

View File

@ -0,0 +1,61 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.core.SolrResourceLoader;
/**
* Simple tests for {@link KuromojiTokenizerFactory}
*/
public class TestKuromojiTokenizerFactory extends BaseTokenTestCase {
public void testSimple() throws IOException {
KuromojiTokenizerFactory factory = new KuromojiTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
factory.inform(new SolrResourceLoader(null, null));
TokenStream ts = factory.create(new StringReader("これは本ではない"));
    assertTokenStreamContents(ts,
        new String[] { "これ", "は", "本", "で", "は", "ない" },
        new int[] { 0, 2, 3, 4, 5, 6 },
        new int[] { 2, 3, 4, 5, 6, 8 }
    );
}
public void testUserDict() throws IOException {
String userDict =
"# Custom segmentation for long entries\n" +
"日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞\n" +
"関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞\n" +
"# Custom reading for sumo wrestler\n" +
"朝青龍,朝青龍,アサショウリュウ,カスタム人名\n";
KuromojiTokenizerFactory factory = new KuromojiTokenizerFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("user-dictionary", "userdict.txt");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(userDict));
TokenStream ts = factory.create(new StringReader("関西国際空港に行った"));
    assertTokenStreamContents(ts,
        new String[] { "関西", "国際", "空港", "に", "行っ", "た" }
    );
}
}