Kuromoji now produces both compound words and the segmentation of those words in search mode (LUCENE-3767)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1296805 13f79535-47bb-0310-9956-ffa450edef68
Christian Moen 2012-03-04 13:34:13 +00:00
parent ff0650ffa4
commit 430365f7cc
31 changed files with 2597 additions and 1329 deletions

View File

@@ -154,6 +154,9 @@ New Features
* LUCENE-3730: Refine Kuromoji search mode (Mode.SEARCH) decompounding
heuristics. (Christian Moen via Robert Muir)
* LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words
and the segmentation of that compound in Mode.SEARCH. (Robert Muir, Mike McCandless via Christian Moen)
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
joins in both parent to child and child to parent directions.

View File

@@ -52,9 +52,10 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
* @param positionIncrement the distance from the prior term
*/
public void setPositionIncrement(int positionIncrement) {
if (positionIncrement < 0)
if (positionIncrement < 0) {
throw new IllegalArgumentException
("Increment must be zero or greater: " + positionIncrement);
("Increment must be zero or greater: got " + positionIncrement);
}
this.positionIncrement = positionIncrement;
}
@@ -77,7 +78,8 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
}
if (other instanceof PositionIncrementAttributeImpl) {
return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
PositionIncrementAttributeImpl _other = (PositionIncrementAttributeImpl) other;
return positionIncrement == _other.positionIncrement;
}
return false;
@@ -93,5 +95,4 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
PositionIncrementAttribute t = (PositionIncrementAttribute) target;
t.setPositionIncrement(positionIncrement);
}
}

View File

@@ -0,0 +1,41 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
/** The positionLength determines how many positions this
* token spans. Very few analyzer components actually
* produce this attribute, and indexing ignores it, but
* it's useful to express the graph structure naturally
* produced by decompounding, word splitting/joining,
* synonym filtering, etc.
*
* <p>The default value is one. */
public interface PositionLengthAttribute extends Attribute {
/** @param positionLength how many positions this token
* spans. */
public void setPositionLength(int positionLength);
/** Returns the position length of this Token.
* @see #setPositionLength
*/
public int getPositionLength();
}
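To make the graph semantics concrete, here is a minimal consumer sketch (not part of this commit; the class and method names are illustrative). A token with position increment 0 starts at the same position as the previous token, and a position length greater than 1 means it spans several positions, as a compound produced by decompounding does.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

public class PrintTokenGraph {
  /** Prints each token as a half-open position interval [start, start+length). */
  public static void print(TokenStream ts) throws IOException {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
      pos += posIncAtt.getPositionIncrement();   // increment 0 keeps the same start position
      System.out.println(termAtt + " [" + pos + "," + (pos + posLenAtt.getPositionLength()) + ")");
    }
    ts.end();
    ts.close();
  }
}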

View File

@@ -0,0 +1,74 @@
package org.apache.lucene.analysis.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeImpl;
/** See {@link PositionLengthAttribute}. */
public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable {
private int positionLength = 1;
/** @param positionLength how many positions this token
* spans. NOTE: this is optional, and most analyzers
* don't change the default value (1). */
public void setPositionLength(int positionLength) {
if (positionLength < 1) {
throw new IllegalArgumentException
("Position length must be 1 or greater: got " + positionLength);
}
this.positionLength = positionLength;
}
/** Returns the position length of this Token.
* @see #setPositionLength
*/
public int getPositionLength() {
return positionLength;
}
@Override
public void clear() {
this.positionLength = 1;
}
@Override
public boolean equals(Object other) {
if (other == this) {
return true;
}
if (other instanceof PositionLengthAttributeImpl) {
PositionLengthAttributeImpl _other = (PositionLengthAttributeImpl) other;
return positionLength == _other.positionLength;
}
return false;
}
@Override
public int hashCode() {
return positionLength;
}
@Override
public void copyTo(AttributeImpl target) {
PositionLengthAttribute t = (PositionLengthAttribute) target;
t.setPositionLength(positionLength);
}
}

View File

@@ -0,0 +1,148 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
/** Acts like a forever growing char[] as you read
* characters into it from the provided reader, but
* internally it uses a circular buffer to only hold the
* characters that haven't been freed yet. This is like a
* PushbackReader, except you don't have to specify
* up-front the max size of the buffer, but you do have to
* periodically call {@link #freeBefore}. */
public final class RollingCharBuffer {
private Reader reader;
private char[] buffer = new char[32];
// Next array index to write to in buffer:
private int nextWrite;
// Next absolute position to read from reader:
private int nextPos;
// How many valid chars (wrapped) are in the buffer:
private int count;
// True if we hit EOF
private boolean end;
/** Clear array and switch to new reader. */
public void reset(Reader reader) {
this.reader = reader;
nextPos = 0;
nextWrite = 0;
count = 0;
end = false;
}
/** Absolute position read. NOTE: pos must not jump
* ahead by more than 1! I.e., it's OK to read arbitrarily
* far back (just not prior to the last {@link
* #freeBefore}), but NOT OK to read arbitrarily far
* ahead. Returns -1 if you hit EOF. */
public int get(int pos) throws IOException {
//System.out.println(" get pos=" + pos + " nextPos=" + nextPos + " count=" + count);
if (pos == nextPos) {
if (end) {
return -1;
}
final int ch = reader.read();
if (ch == -1) {
end = true;
return -1;
}
if (count == buffer.length) {
// Grow
final char[] newBuffer = new char[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_CHAR)];
//System.out.println(Thread.currentThread().getName() + ": cb grow " + newBuffer.length);
System.arraycopy(buffer, nextWrite, newBuffer, 0, buffer.length - nextWrite);
System.arraycopy(buffer, 0, newBuffer, buffer.length - nextWrite, nextWrite);
nextWrite = buffer.length;
buffer = newBuffer;
}
if (nextWrite == buffer.length) {
nextWrite = 0;
}
buffer[nextWrite++] = (char) ch;
count++;
nextPos++;
return ch;
} else {
// Cannot read from future (except by 1):
assert pos < nextPos;
// Cannot read from already freed past:
assert nextPos - pos <= count;
final int index = getIndex(pos);
return buffer[index];
}
}
// For assert:
private boolean inBounds(int pos) {
return pos >= 0 && pos < nextPos && pos >= nextPos - count;
}
private int getIndex(int pos) {
int index = nextWrite - (nextPos - pos);
if (index < 0) {
// Wrap:
index += buffer.length;
assert index >= 0;
}
return index;
}
public char[] get(int posStart, int length) {
assert length > 0;
assert inBounds(posStart): "posStart=" + posStart + " length=" + length;
//System.out.println(" buffer.get posStart=" + posStart + " len=" + length);
final int startIndex = getIndex(posStart);
final int endIndex = getIndex(posStart + length);
//System.out.println(" startIndex=" + startIndex + " endIndex=" + endIndex);
final char[] result = new char[length];
if (endIndex >= startIndex && length < buffer.length) {
System.arraycopy(buffer, startIndex, result, 0, endIndex-startIndex);
} else {
// Wrapped:
final int part1 = buffer.length-startIndex;
System.arraycopy(buffer, startIndex, result, 0, part1);
System.arraycopy(buffer, 0, result, buffer.length-startIndex, length-part1);
}
return result;
}
/** Call this to notify us that no chars before this
* absolute position are needed anymore. */
public void freeBefore(int pos) {
assert pos >= 0;
assert pos <= nextPos;
final int newCount = nextPos - pos;
assert newCount <= count: "newCount=" + newCount + " count=" + count;
assert newCount <= buffer.length: "newCount=" + newCount + " buf.length=" + buffer.length;
count = newCount;
}
}
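For illustration, a small usage sketch (not from this commit; countRuns and its whitespace logic are hypothetical): scan a Reader strictly one position at a time and periodically free the consumed prefix, so memory stays bounded no matter how long the input is.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.util.RollingCharBuffer;

public class RollingCharBufferDemo {
  // Counts whitespace-separated runs, keeping only the unconsumed tail buffered.
  public static int countRuns(Reader reader) throws IOException {
    RollingCharBuffer buffer = new RollingCharBuffer();
    buffer.reset(reader);
    int pos = 0;
    int runs = 0;
    boolean inRun = false;
    int ch;
    while ((ch = buffer.get(pos)) != -1) {   // reads exactly one position ahead
      pos++;
      if (Character.isWhitespace(ch)) {
        if (inRun) runs++;
        inRun = false;
        buffer.freeBefore(pos);              // earlier chars are no longer needed
      } else {
        inRun = true;
      }
    }
    if (inRun) runs++;
    return runs;
  }

  public static void main(String[] args) throws IOException {
    System.out.println(countRuns(new StringReader("hello rolling buffer")));  // prints 3
  }
}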

View File

@@ -840,6 +840,7 @@ public final class FST<T> {
}
public Arc<T> readFirstRealTargetArc(int node, Arc<T> arc, final BytesReader in) throws IOException {
assert in.bytes == bytes;
final int address = getNodeAddress(node);
in.pos = address;
//System.out.println(" readFirstRealTargetArc address="
@@ -936,6 +937,7 @@ public final class FST<T> {
/** Never returns null, but you should never call this if
* arc.isLast() is true. */
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
assert in.bytes == bytes;
// TODO: can't assert this because we call from readFirstArc
// assert !flag(arc.flags, BIT_LAST_ARC);
@@ -1019,6 +1021,7 @@ public final class FST<T> {
* This returns null if the arc was not found, else the incoming arc. */
public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
assert cachedRootArcs != null;
assert in.bytes == bytes;
if (labelToMatch == END_LABEL) {
if (follow.isFinal()) {
@@ -1225,17 +1228,20 @@ public final class FST<T> {
/** Expert */
public static abstract class BytesReader extends DataInput {
int pos;
protected int pos;
protected final byte[] bytes;
protected BytesReader(byte[] bytes, int pos) {
this.bytes = bytes;
this.pos = pos;
}
abstract void skip(int byteCount);
abstract void skip(int base, int byteCount);
}
final static class ReverseBytesReader extends BytesReader {
final byte[] bytes;
public ReverseBytesReader(byte[] bytes, int pos) {
this.bytes = bytes;
this.pos = pos;
super(bytes, pos);
}
@Override
@@ -1262,11 +1268,9 @@ public final class FST<T> {
// TODO: can we use just ByteArrayDataInput...? need to
// add a .skipBytes to DataInput.. hmm and .setPosition
final static class ForwardBytesReader extends BytesReader {
final byte[] bytes;
public ForwardBytesReader(byte[] bytes, int pos) {
this.bytes = bytes;
this.pos = pos;
super(bytes, pos);
}
@Override

View File

@@ -29,6 +29,8 @@ public class TestSimpleAttributeImpl extends LuceneTestCase {
public void testAttributes() {
_TestUtil.assertAttributeReflection(new PositionIncrementAttributeImpl(),
Collections.singletonMap(PositionIncrementAttribute.class.getName()+"#positionIncrement", 1));
_TestUtil.assertAttributeReflection(new PositionLengthAttributeImpl(),
Collections.singletonMap(PositionLengthAttribute.class.getName()+"#positionLength", 1));
_TestUtil.assertAttributeReflection(new FlagsAttributeImpl(),
Collections.singletonMap(FlagsAttribute.class.getName()+"#flags", 0));
_TestUtil.assertAttributeReflection(new TypeAttributeImpl(),

View File

@@ -0,0 +1,94 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.StringReader;
public class TestRollingCharBuffer extends LuceneTestCase {
public void test() throws Exception {
final int ITERS = atLeast(1000);
RollingCharBuffer buffer = new RollingCharBuffer();
for(int iter=0;iter<ITERS;iter++) {
final int stringLen = random.nextBoolean() ? random.nextInt(50) : random.nextInt(20000);
final String s;
if (stringLen == 0) {
s = "";
} else {
s = _TestUtil.randomUnicodeString(random, stringLen);
}
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter + " s.length()=" + s.length());
}
buffer.reset(new StringReader(s));
int nextRead = 0;
int availCount = 0;
while(nextRead < s.length()) {
if (VERBOSE) {
System.out.println(" cycle nextRead=" + nextRead + " avail=" + availCount);
}
if (availCount == 0 || random.nextBoolean()) {
// Read next char
if (VERBOSE) {
System.out.println(" new char");
}
assertEquals(s.charAt(nextRead), buffer.get(nextRead));
nextRead++;
availCount++;
} else if (random.nextBoolean()) {
// Read previous char
int pos = _TestUtil.nextInt(random, nextRead-availCount, nextRead-1);
if (VERBOSE) {
System.out.println(" old char pos=" + pos);
}
assertEquals(s.charAt(pos), buffer.get(pos));
} else {
// Read slice
int length;
if (availCount == 1) {
length = 1;
} else {
length = _TestUtil.nextInt(random, 1, availCount);
}
int start;
if (length == availCount) {
start = nextRead - availCount;
} else {
start = nextRead - availCount + random.nextInt(availCount-length);
}
if (VERBOSE) {
System.out.println(" slice start=" + start + " length=" + length);
}
assertEquals(s.substring(start, start+length),
new String(buffer.get(start, length)));
}
if (availCount > 0 && random.nextInt(20) == 17) {
final int toFree = random.nextInt(availCount);
if (VERBOSE) {
System.out.println(" free " + toFree + " (avail=" + (availCount-toFree) + ")");
}
buffer.freeBefore(nextRead-(availCount-toFree));
availCount -= toFree;
}
}
}
}
}

View File

@@ -17,13 +17,18 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
@@ -83,7 +88,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
assertNotNull(output);
CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@@ -107,6 +112,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
}
PositionLengthAttribute posLengthAtt = null;
if (posLengths != null) {
assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
}
ts.reset();
for (int i = 0; i < output.length; i++) {
@@ -116,6 +127,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
if (typeAtt != null) typeAtt.setType("bogusType");
if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
assertTrue("token "+i+" does not exist", ts.incrementToken());
@@ -130,6 +142,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertEquals("type "+i, types[i], typeAtt.type());
if (posIncrements != null)
assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
if (posLengths != null)
assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());
// we can enforce some basic things about a few attributes even if the caller doesn't check:
if (offsetAtt != null) {
@@ -138,14 +152,18 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
if (finalOffset != null) {
assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
assertTrue("endOffset must be <= finalOffset", offsetAtt.endOffset() <= finalOffset.intValue());
assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
offsetAtt.endOffset() <= finalOffset.intValue());
}
}
if (posIncrAtt != null) {
assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
}
if (posLengthAtt != null) {
assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
}
}
assertFalse("end of stream", ts.incrementToken());
assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
ts.end();
if (finalOffset != null)
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@@ -155,65 +173,81 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
ts.close();
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null);
assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
assertTokenStreamContents(ts, output, null, null, null, null, null);
assertTokenStreamContents(ts, output, null, null, null, null, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
assertTokenStreamContents(ts, output, null, null, types, null, null);
assertTokenStreamContents(ts, output, null, null, types, null, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
assertTokenStreamContents(ts, output, null, null, null, posIncrements, null);
assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null);
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, finalOffset);
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null);
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, finalOffset);
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
}
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
assertAnalyzesTo(a, input, output, null, null, null, null);
assertAnalyzesTo(a, input, output, null, null, null, null, null);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
assertAnalyzesTo(a, input, output, null, null, types, null);
assertAnalyzesTo(a, input, output, null, null, types, null, null);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
}
public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements);
assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
}
public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
}
public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
@@ -326,7 +360,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
if (VERBOSE) {
System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
}
int remainder = random.nextInt(10);
@@ -336,10 +370,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
List<String> tokens = new ArrayList<String>();
List<String> types = new ArrayList<String>();
List<Integer> positions = new ArrayList<Integer>();
List<Integer> positionLengths = new ArrayList<Integer>();
List<Integer> startOffsets = new ArrayList<Integer>();
List<Integer> endOffsets = new ArrayList<Integer>();
ts.reset();
@@ -347,6 +383,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
tokens.add(termAtt.toString());
if (typeAtt != null) types.add(typeAtt.type());
if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
if (offsetAtt != null) {
startOffsets.add(offsetAtt.startOffset());
endOffsets.add(offsetAtt.endOffset());
@@ -357,11 +394,21 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// verify reusing is "reproducible" and also get the normal tokenstream sanity checks
if (!tokens.isEmpty()) {
if (VERBOSE) {
System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
}
reader = new StringReader(text);
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
@@ -369,7 +416,18 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
null,
text.length());
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
@@ -378,6 +436,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(endOffsets),
null,
toIntArray(positions),
null,
text.length());
} else if (offsetAtt != null) {
// offset
@@ -387,6 +446,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
toIntArray(endOffsets),
null,
null,
null,
text.length());
} else {
// terms only
@@ -396,6 +456,22 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
}
}
}
protected String toDot(Analyzer a, String inputText) throws IOException {
final StringWriter sw = new StringWriter();
final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
ts.reset();
new TokenStreamToDot(inputText, ts, new PrintWriter(sw)).toDot();
return sw.toString();
}
protected void toDotFile(Analyzer a, String inputText, String localFileName) throws IOException {
Writer w = new OutputStreamWriter(new FileOutputStream(localFileName), "UTF-8");
final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
ts.reset();
new TokenStreamToDot(inputText, ts, new PrintWriter(w)).toDot();
w.close();
}
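As an illustration of the new posLengths-aware assertions, here is a hedged sketch of a test; the class name is hypothetical, and the expected values illustrate Kuromoji's Mode.SEARCH output, where the compound 関西国際空港 ("Kansai International Airport") is emitted at the same position as 関西 (increment 0) and spans all three segments (position length 3).

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.kuromoji.KuromojiAnalyzer;

public class TestCompoundPositions extends BaseTokenStreamTestCase {
  public void testCompoundSpansItsParts() throws IOException {
    Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesToPositions(a, "関西国際空港",
        new String[] { "関西", "関西国際空港", "国際", "空港" },
        new int[]    { 1, 0, 1, 1 },   // position increments
        new int[]    { 1, 3, 1, 1 });  // position lengths
  }
}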
static int[] toIntArray(List<Integer> list) {
int ret[] = new int[list.size()];

View File

@@ -0,0 +1,159 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.PrintWriter;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
/** Consumes a TokenStream and outputs the dot (graphviz) string (graph). */
public class TokenStreamToDot {
private final TokenStream in;
private final CharTermAttribute termAtt;
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLengthAtt;
private final OffsetAttribute offsetAtt;
private final String inputText;
protected final PrintWriter out;
/** If inputText is non-null, and the TokenStream has
* offsets, we include the surface form in each arc's
* label. */
public TokenStreamToDot(String inputText, TokenStream in, PrintWriter out) {
this.in = in;
this.out = out;
this.inputText = inputText;
termAtt = in.addAttribute(CharTermAttribute.class);
posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
if (in.hasAttribute(OffsetAttribute.class)) {
offsetAtt = in.addAttribute(OffsetAttribute.class);
} else {
offsetAtt = null;
}
}
public void toDot() throws IOException {
in.reset();
writeHeader();
// TODO: is there some way to tell dot that it should
// make the "main path" a straight line and have the
// non-sausage arcs not affect node placement...
int pos = -1;
int lastEndPos = -1;
while (in.incrementToken()) {
final boolean isFirst = pos == -1;
int posInc = posIncAtt.getPositionIncrement();
if (isFirst && posInc == 0) {
// TODO: hmm are TS's still allowed to do this...?
System.err.println("WARNING: first posInc was 0; correcting to 1");
posInc = 1;
}
if (posInc > 0) {
// New node:
pos += posInc;
writeNode(pos, Integer.toString(pos));
}
if (posInc > 1) {
// Gap!
writeArc(lastEndPos, pos, null, "dotted");
}
if (isFirst) {
writeNode(-1, null);
writeArc(-1, pos, null, null);
}
String arcLabel = termAtt.toString();
if (offsetAtt != null) {
final int startOffset = offsetAtt.startOffset();
final int endOffset = offsetAtt.endOffset();
//System.out.println("start=" + startOffset + " end=" + endOffset + " len=" + inputText.length());
if (inputText != null) {
arcLabel += " / " + inputText.substring(startOffset, endOffset);
} else {
arcLabel += " / " + startOffset + "-" + endOffset;
}
}
writeArc(pos, pos + posLengthAtt.getPositionLength(), arcLabel, null);
lastEndPos = pos + posLengthAtt.getPositionLength();
}
in.end();
if (lastEndPos != -1) {
// TODO: should we output any final text (from end
// offsets) on this arc...?
writeNode(-2, null);
writeArc(lastEndPos, -2, null, null);
}
writeTrailer();
}
protected void writeArc(int fromNode, int toNode, String label, String style) {
out.print(" " + fromNode + " -> " + toNode + " [");
if (label != null) {
out.print(" label=\"" + label + "\"");
}
if (style != null) {
out.print(" style=\"" + style + "\"");
}
out.println("]");
}
protected void writeNode(int name, String label) {
out.print(" " + name);
if (label != null) {
out.print(" [label=\"" + label + "\"]");
} else {
out.print(" [shape=point color=white]");
}
out.println();
}
private final static String FONT_NAME = "Helvetica";
/** Override to customize. */
protected void writeHeader() {
out.println("digraph tokens {");
out.println(" graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];");
out.println(" // A2 paper size");
out.println(" size = \"34.4,16.5\";");
//out.println(" // try to fill paper");
//out.println(" ratio = fill;");
out.println(" edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]");
out.println(" node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]");
out.println();
}
/** Override to customize. */
protected void writeTrailer() {
out.println("}");
}
}
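writeHeader and writeTrailer are protected precisely so the output can be customized by subclassing; here is a minimal hypothetical example (not part of the commit) that replaces the page setup with a bare left-to-right layout. The resulting dot source renders with graphviz as usual, e.g. dot -Tpng tokens.dot -o tokens.png.

import java.io.PrintWriter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToDot;

public class MinimalTokenStreamToDot extends TokenStreamToDot {
  public MinimalTokenStreamToDot(String inputText, TokenStream in, PrintWriter out) {
    super(inputText, in, out);
  }
  @Override
  protected void writeHeader() {
    // Same graph, minimal page setup: just force left-to-right layout.
    out.println("digraph tokens {");
    out.println("  rankdir = \"LR\";");
  }
}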

View File

@@ -112,6 +112,8 @@ public final class SynonymFilter extends TokenFilter {
private int captureCount;
// TODO: we should set PositionLengthAttr too...
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

View File

@@ -0,0 +1,180 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Position;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Type;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.WrappedPositionArray;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
// TODO: would be nice to show 2nd best path in a diff't
// color...
public class GraphvizFormatter {
private final static String BOS_LABEL = "BOS";
private final static String EOS_LABEL = "EOS";
private final static String FONT_NAME = "Helvetica";
private final ConnectionCosts costs;
private final Map<String, String> bestPathMap;
private final StringBuilder sb = new StringBuilder();
public GraphvizFormatter(ConnectionCosts costs) {
this.costs = costs;
this.bestPathMap = new HashMap<String, String>();
sb.append(formatHeader());
sb.append(" init [style=invis]\n");
sb.append(" init -> 0.0 [label=\"" + BOS_LABEL + "\"]\n");
}
public String finish() {
sb.append(formatTrailer());
return sb.toString();
}
// Backtraces another incremental fragment:
void onBacktrace(KuromojiTokenizer tok, WrappedPositionArray positions, int lastBackTracePos, Position endPosData, int fromIDX, char[] fragment, boolean isEnd) {
setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
if (isEnd) {
sb.append(" fini [style=invis]\n");
sb.append(" ");
sb.append(getNodeID(endPosData.pos, fromIDX));
sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
}
}
// Records which arcs make up the best path:
private void setBestPathMap(WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
bestPathMap.clear();
int pos = endPosData.pos;
int bestIDX = fromIDX;
while (pos > startPos) {
final Position posData = positions.get(pos);
final int backPos = posData.backPos[bestIDX];
final int backIDX = posData.backIndex[bestIDX];
final String toNodeID = getNodeID(pos, bestIDX);
final String fromNodeID = getNodeID(backPos, backIDX);
assert !bestPathMap.containsKey(fromNodeID);
assert !bestPathMap.containsValue(toNodeID);
bestPathMap.put(fromNodeID, toNodeID);
pos = backPos;
bestIDX = backIDX;
}
}
private String formatNodes(KuromojiTokenizer tok, WrappedPositionArray positions, int startPos, Position endPosData, char[] fragment) {
StringBuilder sb = new StringBuilder();
// Output nodes
for (int pos = startPos+1; pos <= endPosData.pos; pos++) {
final Position posData = positions.get(pos);
for(int idx=0;idx<posData.count;idx++) {
sb.append(" ");
sb.append(getNodeID(pos, idx));
sb.append(" [label=\"");
sb.append(pos);
sb.append(": ");
sb.append(posData.lastRightID[idx]);
sb.append("\"]\n");
}
}
// Output arcs
for (int pos = endPosData.pos; pos > startPos; pos--) {
final Position posData = positions.get(pos);
for(int idx=0;idx<posData.count;idx++) {
final Position backPosData = positions.get(posData.backPos[idx]);
final String toNodeID = getNodeID(pos, idx);
final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);
sb.append(" ");
sb.append(fromNodeID);
sb.append(" -> ");
sb.append(toNodeID);
final String attrs;
if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
// This arc is on best path
attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
} else {
attrs = "";
}
final Dictionary dict = tok.getDict(posData.backType[idx]);
final int wordCost = dict.getWordCost(posData.backID[idx]);
final int bgCost = costs.get(backPosData.lastRightID[posData.backIndex[idx]],
dict.getLeftId(posData.backID[idx]));
final String surfaceForm = new String(fragment,
posData.backPos[idx] - startPos,
pos - posData.backPos[idx]);
sb.append(" [label=\"");
sb.append(surfaceForm);
sb.append(' ');
sb.append(wordCost);
if (bgCost >= 0) {
sb.append('+');
}
sb.append(bgCost);
sb.append("\"");
sb.append(attrs);
sb.append("]\n");
}
}
return sb.toString();
}
private String formatHeader() {
StringBuilder sb = new StringBuilder();
sb.append("digraph viterbi {\n");
sb.append(" graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
//sb.append(" // A2 paper size\n");
//sb.append(" size = \"34.4,16.5\";\n");
//sb.append(" // try to fill paper\n");
//sb.append(" ratio = fill;\n");
sb.append(" edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
sb.append(" node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
return sb.toString();
}
private String formatTrailer() {
return "}";
}
private String getNodeID(int pos, int idx) {
return pos + "." + idx;
}
}
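A debugging sketch, hedged: it assumes the new KuromojiTokenizer exposes a setter such as setGraphvizFormatter to attach the formatter whose onBacktrace it calls back into; adapt the hook name if it differs.

import java.io.StringReader;
import org.apache.lucene.analysis.kuromoji.GraphvizFormatter;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;

public class ViterbiDebug {
  public static void main(String[] args) throws Exception {
    GraphvizFormatter gv = new GraphvizFormatter(ConnectionCosts.getInstance());
    KuromojiTokenizer tok = new KuromojiTokenizer(
        new StringReader("関西国際空港"), null, true, KuromojiTokenizer.DEFAULT_MODE);
    tok.setGraphvizFormatter(gv);      // assumed hook; see onBacktrace above
    tok.reset();
    while (tok.incrementToken()) {}    // drive the lattice construction
    tok.end();
    tok.close();
    System.out.println(gv.finish());   // dot source for the whole lattice
  }
}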

View File

@@ -27,21 +27,25 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
public class KuromojiAnalyzer extends StopwordAnalyzerBase {
private final Segmenter segmenter;
private final Mode mode;
private final Set<String> stoptags;
private final UserDictionary userDict;
public KuromojiAnalyzer(Version matchVersion) {
this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
this(matchVersion, null, KuromojiTokenizer.DEFAULT_MODE, DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
}
public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, CharArraySet stopwords, Set<String> stoptags) {
public KuromojiAnalyzer(Version matchVersion, UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
super(matchVersion, stopwords);
this.segmenter = segmenter;
this.userDict = userDict;
this.mode = mode;
this.stoptags = stoptags;
}
@@ -79,7 +83,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
Tokenizer tokenizer = new KuromojiTokenizer(reader, userDict, true, mode);
TokenStream stream = new KuromojiBaseFormFilter(tokenizer);
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
stream = new CJKWidthFilter(stream);
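For callers migrating off the removed Segmenter-based constructor, usage now looks like this sketch; Version.LUCENE_40 and the empty stopword/stoptag sets are illustrative choices, not mandated by the commit.

import java.util.Collections;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.kuromoji.KuromojiAnalyzer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class AnalyzerSetup {
  public static void main(String[] args) {
    // Default stopwords/stoptags with search-mode decompounding:
    Analyzer a1 = new KuromojiAnalyzer(Version.LUCENE_40);

    // Fully explicit: no user dictionary, decompounding disabled:
    Analyzer a2 = new KuromojiAnalyzer(Version.LUCENE_40,
        null,                             // no UserDictionary
        Mode.NORMAL,                      // no decompounding
        CharArraySet.EMPTY_SET,
        Collections.<String>emptySet());
  }
}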

View File

@@ -1,214 +0,0 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
/**
* Tokenizer main class.
* Thread safe.
*/
public class Segmenter {
public static enum Mode {
NORMAL, SEARCH, EXTENDED
}
public static final Mode DEFAULT_MODE = Mode.SEARCH;
private final Viterbi viterbi;
private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
private final boolean split;
public Segmenter() {
this(null, DEFAULT_MODE, false);
}
public Segmenter(Mode mode) {
this(null, mode, false);
}
public Segmenter(UserDictionary userDictionary) {
this(userDictionary, DEFAULT_MODE, false);
}
public Segmenter(UserDictionary userDictionary, Mode mode) {
this(userDictionary, mode, false);
}
public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) {
final TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
final UnknownDictionary unknownDict = UnknownDictionary.getInstance();
this.viterbi = new Viterbi(dict, unknownDict, ConnectionCosts.getInstance(), userDictionary, mode);
this.split = split;
dictionaryMap.put(Type.KNOWN, dict);
dictionaryMap.put(Type.UNKNOWN, unknownDict);
dictionaryMap.put(Type.USER, userDictionary);
}
/**
* Tokenize input text
* @param text
* @return list of Token
*/
public List<Token> tokenize(String text) {
if (!split) {
return doTokenize(0, text);
}
List<Integer> splitPositions = getSplitPositions(text);
if(splitPositions.size() == 0) {
return doTokenize(0, text);
}
ArrayList<Token> result = new ArrayList<Token>();
int offset = 0;
for(int position : splitPositions) {
result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
offset = position + 1;
}
if(offset < text.length()) {
result.addAll(doTokenize(offset, text.substring(offset)));
}
return result;
}
/**
* Split input text at 句読点, which are 。 and 、
* @param text
* @return list of split position
*/
private List<Integer> getSplitPositions(String text) {
ArrayList<Integer> splitPositions = new ArrayList<Integer>();
int position = 0;
int currentPosition = 0;
while(true) {
int indexOfMaru = text.indexOf("。", currentPosition);
int indexOfTen = text.indexOf("、", currentPosition);
if(indexOfMaru < 0 || indexOfTen < 0) {
position = Math.max(indexOfMaru, indexOfTen);
} else {
position = Math.min(indexOfMaru, indexOfTen);
}
if(position >= 0) {
splitPositions.add(position);
currentPosition = position + 1;
} else {
break;
}
}
return splitPositions;
}
private List<Token> doTokenize(int offset, String sentence) {
char text[] = sentence.toCharArray();
return doTokenize(offset, text, 0, text.length, false);
}
/**
* Tokenize input sentence.
* @param offset offset of sentence in original input text
* @param sentence sentence to tokenize
* @return list of Token
*/
public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
ArrayList<Token> result = new ArrayList<Token>();
ViterbiNode[][][] lattice;
try {
lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
} catch (IOException impossible) {
throw new RuntimeException(impossible);
}
List<ViterbiNode> bestPath = viterbi.search(lattice);
for (ViterbiNode node : bestPath) {
int wordId = node.getWordId();
if (node.getType() == Type.KNOWN && wordId == -1){ // Do not include BOS/EOS
continue;
} else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
continue; // Do not emit punctuation
}
Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
result.add(token);
}
return result;
}
/** returns a Graphviz String */
public String debugTokenize(String text) {
ViterbiNode[][][] lattice;
try {
lattice = this.viterbi.build(text.toCharArray(), 0, text.length());
} catch (IOException impossible) {
throw new RuntimeException(impossible);
}
List<ViterbiNode> bestPath = this.viterbi.search(lattice);
return new GraphvizFormatter(ConnectionCosts.getInstance())
.format(lattice[0], lattice[1], bestPath);
}
static final boolean isPunctuation(char ch) {
switch(Character.getType(ch)) {
case Character.SPACE_SEPARATOR:
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
case Character.CONTROL:
case Character.FORMAT:
case Character.DASH_PUNCTUATION:
case Character.START_PUNCTUATION:
case Character.END_PUNCTUATION:
case Character.CONNECTOR_PUNCTUATION:
case Character.OTHER_PUNCTUATION:
case Character.MATH_SYMBOL:
case Character.CURRENCY_SYMBOL:
case Character.MODIFIER_SYMBOL:
case Character.OTHER_SYMBOL:
case Character.INITIAL_QUOTE_PUNCTUATION:
case Character.FINAL_QUOTE_PUNCTUATION:
return true;
default:
return false;
}
}
}

View File

@@ -17,8 +17,8 @@ package org.apache.lucene.analysis.kuromoji;
* limitations under the License.
*/
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Type;
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
public class Token {
private final Dictionary dictionary;
@@ -30,6 +30,7 @@ public class Token {
private final int length;
private final int position;
private int positionLength;
private final Type type;
@@ -40,8 +41,14 @@ public class Token {
this.length = length;
this.type = type;
this.position = position;
this.positionLength = positionLength;
this.dictionary = dictionary;
}
@Override
public String toString() {
return "Token(\"" + new String(surfaceForm, offset, length) + "\" pos=" + position + " type=" + type + " wordId=" + wordId + " leftID=" + dictionary.getLeftId(wordId) + ")";
}
/**
* @return surfaceForm
@@ -144,4 +151,21 @@ public class Token {
public int getPosition() {
return position;
}
/**
* Set the position length (in tokens) of this token. For normal
* tokens this is 1; for compound tokens it's > 1.
*/
public void setPositionLength(int positionLength) {
this.positionLength = positionLength;
}
/**
* Get the position length (in tokens) of this token. For normal
* tokens this is 1; for compound tokens it's > 1.
* @return position length of token
*/
public int getPositionLength() {
return positionLength;
}
}

View File

@@ -27,6 +27,7 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
@@ -159,6 +160,10 @@ public final class UserDictionary implements Dictionary {
return found ? toIndexArray(result) : EMPTY_RESULT;
}
public TokenInfoFST getFST() {
return fst;
}
private static final int[][] EMPTY_RESULT = new int[0][];
/**
@@ -181,6 +186,10 @@ public final class UserDictionary implements Dictionary {
}
return result.toArray(new int[result.size()][]);
}
public int[] lookupSegmentation(int phraseID) {
return segmentations[phraseID];
}
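For context, a hedged sketch of where these segmentations come from: user dictionary entries carry a custom segmentation alongside the surface form (the CSV layout shown is the Kuromoji user-dictionary format as commonly documented: surface, segmentation, readings, part-of-speech), and lookupSegmentation(phraseID) returns the recorded split for one phrase.

import java.io.StringReader;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;

public class UserDictDemo {
  public static void main(String[] args) throws Exception {
    // surface form, segmentation, readings, part-of-speech name
    String entry = "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞\n";
    // Assumes the Reader-based constructor:
    UserDictionary userDict = new UserDictionary(new StringReader(entry));
  }
}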
@Override
public int getLeftId(int wordId) {

View File

@@ -1,226 +0,0 @@
package org.apache.lucene.analysis.kuromoji.viterbi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
public class GraphvizFormatter {
private final static String BOS_LABEL = "BOS";
private final static String EOS_LABEL = "EOS";
private final static String FONT_NAME = "Helvetica";
private ConnectionCosts costs;
private Map<String, ViterbiNode> nodeMap;
private Map<String, String> bestPathMap;
private boolean foundBOS;
public GraphvizFormatter(ConnectionCosts costs) {
this.costs = costs;
this.nodeMap = new HashMap<String, ViterbiNode>();
this.bestPathMap = new HashMap<String, String>();
}
public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
initBestPathMap(null);
StringBuilder sb = new StringBuilder();
sb.append(formatHeader());
sb.append(formatNodes(startsArray, endsArray));
sb.append(formatTrailer());
return sb.toString();
}
public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List<ViterbiNode> bestPath) {
// List<ViterbiNode> bestPathWithBOSAndEOS = new ArrayList<ViterbiNode>(bestPath);
initBestPathMap(bestPath);
StringBuilder sb = new StringBuilder();
sb.append(formatHeader());
sb.append(formatNodes(startsArray, endsArray));
sb.append(formatTrailer());
return sb.toString();
}
private void initBestPathMap(List<ViterbiNode> bestPath) {
this.bestPathMap.clear();
if (bestPath == null){
return;
}
for (int i = 0; i < bestPath.size() - 1; i++) {
ViterbiNode from = bestPath.get(i);
ViterbiNode to = bestPath.get(i + 1);
String fromId = getNodeId(from);
String toId = getNodeId(to);
assert this.bestPathMap.containsKey(fromId) == false;
assert this.bestPathMap.containsValue(toId) == false;
this.bestPathMap.put(fromId, toId);
}
}
private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
this.nodeMap.clear();
this.foundBOS = false;
StringBuilder sb = new StringBuilder();
for (int i = 1; i < endsArray.length; i++) {
if(endsArray[i] == null || startsArray[i] == null) {
continue;
}
for (int j = 0; j < endsArray[i].length; j++) {
ViterbiNode from = endsArray[i][j];
if(from == null){
continue;
}
sb.append(formatNodeIfNew(from));
for (int k = 0; k < startsArray[i].length; k++) {
ViterbiNode to = startsArray[i][k];
if(to == null){
break;
}
sb.append(formatNodeIfNew(to));
sb.append(formatEdge(from, to));
}
}
}
return sb.toString();
}
private String formatNodeIfNew(ViterbiNode node) {
String nodeId = getNodeId(node);
if (! this.nodeMap.containsKey(nodeId)) {
this.nodeMap.put(nodeId, node);
return formatNode(node);
} else {
return "";
}
}
private String formatHeader() {
StringBuilder sb = new StringBuilder();
sb.append("digraph viterbi {\n");
sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n");
sb.append("# A2 paper size\n");
sb.append("size = \"34.4,16.5\";\n");
sb.append("# try to fill paper\n");
sb.append("ratio = fill;\n");
sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
return sb.toString();
}
private String formatTrailer() {
return "}";
}
private String formatEdge(ViterbiNode from, ViterbiNode to) {
if (this.bestPathMap.containsKey(getNodeId(from)) &&
this.bestPathMap.get(getNodeId(from)).equals(getNodeId(to))) {
return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 ");
} else {
return formatEdge(from, to, "");
}
}
private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) {
StringBuilder sb = new StringBuilder();
sb.append(getNodeId(from));
sb.append(" -> ");
sb.append(getNodeId(to));
sb.append(" [ ");
sb.append("label=\"");
sb.append(getCost(from, to));
sb.append("\"");
sb.append(" ");
sb.append(attributes);
sb.append(" ");
sb.append(" ]");
sb.append("\n");
return sb.toString();
}
private String formatNode(ViterbiNode node) {
StringBuilder sb = new StringBuilder();
sb.append("\"");
sb.append(getNodeId(node));
sb.append("\"");
sb.append(" [ ");
sb.append("label=");
sb.append(formatNodeLabel(node));
sb.append(" ]");
return sb.toString();
}
private String formatNodeLabel(ViterbiNode node) {
StringBuilder sb = new StringBuilder();
sb.append("<<table border=\"0\" cellborder=\"0\">");
sb.append("<tr><td>");
sb.append(getNodeLabel(node));
sb.append("</td></tr>");
sb.append("<tr><td>");
sb.append("<font color=\"blue\">");
sb.append(node.getWordCost());
sb.append("</font>");
sb.append("</td></tr>");
// sb.append("<tr><td>");
// sb.append(this.dictionary.get(node.getWordId()).getPosInfo());
// sb.append("</td></tr>");
sb.append("</table>>");
return sb.toString();
}
private String getNodeId(ViterbiNode node) {
return String.valueOf(node.hashCode());
}
private String getNodeLabel(ViterbiNode node) {
if (node.getType() == Type.KNOWN && node.getWordId() == 0) {
if (this.foundBOS) {
return EOS_LABEL;
} else {
this.foundBOS = true;
return BOS_LABEL;
}
} else {
return node.getSurfaceFormString();
}
}
private int getCost(ViterbiNode from, ViterbiNode to) {
return this.costs.get(from.getLeftId(), to.getRightId());
}
}

View File

@ -1,365 +0,0 @@
package org.apache.lucene.analysis.kuromoji.viterbi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST;
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
public class Viterbi {
private final TokenInfoFST fst;
private final TokenInfoDictionary dictionary;
private final UnknownDictionary unkDictionary;
private final ConnectionCosts costs;
private final UserDictionary userDictionary;
private final CharacterDefinition characterDefinition;
private final boolean useUserDictionary;
private final boolean searchMode;
private final boolean extendedMode;
private static final int DEFAULT_COST = 10000000;
private static final int SEARCH_MODE_KANJI_LENGTH = 2;
private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH
private static final int SEARCH_MODE_KANJI_PENALTY = 3000;
private static final int SEARCH_MODE_OTHER_PENALTY = 1700;
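// Illustrative arithmetic with the defaults above: a 4-character all-kanji
// token incurs (4 - 2) * 3000 = 6000 extra cost, and a 9-character
// non-kanji token (9 - 7) * 1700 = 3400, biasing search mode toward
// decompounded paths over long single tokens.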
private static final char[] BOS = "BOS".toCharArray();
private static final char[] EOS = "EOS".toCharArray();
/**
* Constructor
*/
public Viterbi(TokenInfoDictionary dictionary,
UnknownDictionary unkDictionary,
ConnectionCosts costs,
UserDictionary userDictionary,
Mode mode) {
this.dictionary = dictionary;
this.fst = dictionary.getFST();
this.unkDictionary = unkDictionary;
this.costs = costs;
this.userDictionary = userDictionary;
if(userDictionary == null) {
this.useUserDictionary = false;
} else {
this.useUserDictionary = true;
}
switch(mode){
case SEARCH:
searchMode = true;
extendedMode = false;
break;
case EXTENDED:
searchMode = true;
extendedMode = true;
break;
default:
searchMode = false;
extendedMode = false;
break;
}
this.characterDefinition = unkDictionary.getCharacterDefinition();
}
/**
* Find best path from input lattice.
* @param lattice the lattice produced by the build method
* @return list of ViterbiNodes that make up the best path
*/
public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
ViterbiNode[][] startIndexArr = lattice[0];
ViterbiNode[][] endIndexArr = lattice[1];
for (int i = 1; i < startIndexArr.length; i++){
if (startIndexArr[i] == null || endIndexArr[i] == null){ // skip: no nodes start at this position, or no previous node ends here
continue;
}
for (ViterbiNode node : startIndexArr[i]) {
if (node == null){ // no more nodes in this array; move on to the next index
break;
}
int backwardConnectionId = node.getLeftId();
int wordCost = node.getWordCost();
int leastPathCost = DEFAULT_COST;
for (ViterbiNode leftNode : endIndexArr[i]) {
if (leftNode == null){ // no more nodes in this array; move on to the next index
break;
}
int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost; // cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
// "Search mode". Add extra costs if it is long node.
if (searchMode) {
// System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
char[] surfaceForm = node.getSurfaceForm();
int offset = node.getOffset();
int length = node.getLength();
if (length > SEARCH_MODE_KANJI_LENGTH) {
boolean allKanji = true;
// check if node consists of only kanji
for (int pos = 0; pos < length; pos++) {
if (!characterDefinition.isKanji(surfaceForm[offset+pos])){
allKanji = false;
break;
}
}
if (allKanji) { // Process only Kanji keywords
pathCost += (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
} else if (length > SEARCH_MODE_OTHER_LENGTH) {
pathCost += (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
}
}
}
if (pathCost < leastPathCost){ // cheapest total cost so far; record this predecessor as the best left node
leastPathCost = pathCost;
node.setPathCost(leastPathCost);
node.setLeftNode(leftNode);
}
}
}
}
// track best path
ViterbiNode node = endIndexArr[0][0]; // EOS
LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
result.add(node);
while (true) {
ViterbiNode leftNode = node.getLeftNode();
if (leftNode == null) {
break;
}
// EXTENDED mode convert unknown word into unigram node
if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
byte unigramWordId = CharacterDefinition.NGRAM;
int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // not strictly required
int unigramRightId = unkDictionary.getRightId(unigramWordId); // not strictly required
int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // not strictly required
char[] surfaceForm = leftNode.getSurfaceForm();
int offset = leftNode.getOffset();
int length = leftNode.getLength();
for (int i = length - 1; i >= 0; i--) {
int charLen = 1;
if (i > 0 && Character.isLowSurrogate(surfaceForm[offset+i])) {
i--;
charLen = 2;
}
ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm, offset + i, charLen, unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i, Type.UNKNOWN);
result.addFirst(uniGramNode);
}
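// e.g. a 3-character unknown word is re-emitted as three unigram nodes,
// added front-to-back (addFirst) so the result list stays in surface order.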
} else {
result.addFirst(leftNode);
}
node = leftNode;
}
return result;
}
/**
* Build the lattice from the input text.
* @param text input text buffer
* @param offset start offset into text
* @param length number of characters to process
*/
public ViterbiNode[][][] build(char text[], int offset, int length) throws IOException {
ViterbiNode[][] startIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS
ViterbiNode[][] endIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS
int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr
int[] endSizeArr = new int[length + 2]; // array to keep ViterbiNode count in endIndexArr
FST.Arc<Long> arc = new FST.Arc<Long>();
ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
final FST.BytesReader fstReader = fst.getBytesReader(0);
// Process user dictionary;
if (useUserDictionary) {
processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
int unknownWordEndIndex = -1; // index of the last character of unknown word
final IntsRef wordIdRef = new IntsRef();
for (int startIndex = 0; startIndex < length; startIndex++) {
// If no token ends where current token starts, skip this index
if (endSizeArr[startIndex + 1] == 0) {
continue;
}
int suffixStart = offset + startIndex;
int suffixLength = length - startIndex;
boolean found = false;
arc = fst.getFirstArc(arc);
int output = 0;
for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
int ch = text[suffixStart + endIndex - 1];
if (fst.findTargetArc(ch, arc, arc, endIndex == 1, fstReader) == null) {
break; // continue to next position
}
output += arc.output.intValue();
if (arc.isFinal()) {
final int finalOutput = output + arc.nextFinalOutput.intValue();
found = true; // Don't produce unknown word starting from this index
dictionary.lookupWordIds(finalOutput, wordIdRef);
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, endIndex, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
}
}
// In normal mode, don't process unknown words greedily: skip if a previously found unknown word still covers this position.
if(!searchMode && unknownWordEndIndex > startIndex){
continue;
}
// Process unknown word: run unknown-word lookup when the first character's class is marked "invoke", or when no known word matched at this position
int unknownWordLength = 0;
char firstCharacter = text[suffixStart];
boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
if (isInvoke){ // Process "invoke"
unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
} else if (found == false){ // Process not "invoke"
unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
}
if (unknownWordLength > 0) { // found unknown word
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
unkDictionary.lookupWordIds(characterId, wordIdRef); // all characters of the unknown word are assumed to share the first character's class
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, unknownWordLength, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
unknownWordEndIndex = startIndex + unknownWordLength;
}
}
ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
return result;
}
/**
* Find user-dictionary token(s) in the input text and add them to the arrays as user tokens
* @param text input text buffer
* @param offset start offset into text
* @param len number of characters to process
* @param startIndexArr nodes indexed by their start position
* @param endIndexArr nodes indexed by their end position
* @param startSizeArr per-position node counts for startIndexArr
* @param endSizeArr per-position node counts for endIndexArr
*/
private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) throws IOException {
int[][] result = userDictionary.lookup(text, offset, len);
for(int[] segmentation : result) {
int wordId = segmentation[0];
int index = segmentation[1];
int length = segmentation[2];
ViterbiNode node = new ViterbiNode(wordId, text, offset + index, length, userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
}
}
/**
* Add a node to the arrays and increment the counts in the size arrays
* @param node node to add
* @param startIndex lattice position at which the node starts
* @param endIndex lattice position at which the node ends
* @param startIndexArr nodes indexed by their start position
* @param endIndexArr nodes indexed by their end position
* @param startSizeArr per-position node counts for startIndexArr
* @param endSizeArr per-position node counts for endIndexArr
*/
private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
int startNodesCount = startSizeArr[startIndex];
int endNodesCount = endSizeArr[endIndex];
if (startNodesCount == 0) {
startIndexArr[startIndex] = new ViterbiNode[10];
}
if (endNodesCount == 0) {
endIndexArr[endIndex] = new ViterbiNode[10];
}
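// capacity starts at 10 and is doubled on overflow (see extendArray below)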
if (startIndexArr[startIndex].length <= startNodesCount){
startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
}
if (endIndexArr[endIndex].length <= endNodesCount){
endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
}
startIndexArr[startIndex][startNodesCount] = node;
endIndexArr[endIndex][endNodesCount] = node;
startSizeArr[startIndex] = startNodesCount + 1;
endSizeArr[endIndex] = endNodesCount + 1;
}
/**
* Return an array twice the size of the input that contains the input's values
* @param array array to extend
* @return the extended array
*/
private ViterbiNode[] extendArray(ViterbiNode[] array) {
// double the capacity, preserving existing values
ViterbiNode[] newArray = new ViterbiNode[array.length * 2];
System.arraycopy(array, 0, newArray, 0, array.length);
return newArray;
}
}

View File

@ -1,147 +0,0 @@
package org.apache.lucene.analysis.kuromoji.viterbi;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public final class ViterbiNode {
public enum Type {
KNOWN,
UNKNOWN,
USER
}
private final int wordId;
private final char[] surfaceForm;
private final int offset;
private final int length;
private final int leftId;
private final int rightId;
/** word cost for this node */
private final int wordCost;
/** minimum path cost found thus far */
private int pathCost;
private ViterbiNode leftNode;
private final Type type;
private final int startIndex;
public ViterbiNode(int wordId, char[] surfaceForm, int offset, int length, int leftId, int rightId, int wordCost, int startIndex, Type type) {
this.wordId = wordId;
this.surfaceForm = surfaceForm;
this.offset = offset;
this.length = length;
this.leftId = leftId;
this.rightId = rightId;
this.wordCost = wordCost;
this.startIndex = startIndex;
this.type = type;
}
/**
* @return the wordId
*/
public int getWordId() {
return wordId;
}
/**
* @return the surfaceForm
*/
public char[] getSurfaceForm() {
return surfaceForm;
}
/**
* @return start offset into surfaceForm
*/
public int getOffset() {
return offset;
}
/**
* @return length of surfaceForm
*/
public int getLength() {
return length;
}
/**
* @return the surfaceForm as a String
*/
public String getSurfaceFormString() {
return new String(surfaceForm, offset, length);
}
/**
* @return the leftId
*/
public int getLeftId() {
return leftId;
}
/**
* @return the rightId
*/
public int getRightId() {
return rightId;
}
/**
* @return the cost
*/
public int getWordCost() {
return wordCost;
}
/**
* @return the cost
*/
public int getPathCost() {
return pathCost;
}
/**
* @param pathCost minimum path cost found thus far
*/
public void setPathCost(int pathCost) {
this.pathCost = pathCost;
}
public void setLeftNode(ViterbiNode node) {
leftNode = node;
}
public ViterbiNode getLeftNode() {
return leftNode;
}
public int getStartIndex() {
return startIndex;
}
public Type getType() {
return type;
}
}

View File

@ -1,231 +0,0 @@
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.List;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
public class SegmenterTest extends LuceneTestCase {
private static Segmenter segmenter;
@BeforeClass
public static void setUpBeforeClass() throws Exception {
segmenter = new Segmenter();
}
@AfterClass
public static void afterClass() throws Exception {
segmenter = null;
}
@Test
public void testSegmentation() {
// Skip tests for Michelle Kwan -- UniDic segments Kwan as ワン
// String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
// String[] surfaceForms = {
// "ミシェル", "", "クワン", "", "優勝", "", "まし", "", "",
// "スペース", "ステーション", "", "行き", "ます", "",
// "うたがわしい", ""
// };
String input = "スペースステーションに行きます。うたがわしい。";
String[] surfaceForms = {
"スペース", "ステーション", "", "行き", "ます", "",
"うたがわしい", ""
};
List<Token> tokens = segmenter.tokenize(input);
assertTrue(tokens.size() == surfaceForms.length);
for (int i = 0; i < tokens.size(); i++) {
assertEquals(surfaceForms[i], tokens.get(i).getSurfaceFormString());
}
}
@Test
public void testReadings() {
List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
assertEquals(6, tokens.size());
assertEquals("スシ", tokens.get(0).getReading());
assertEquals("", tokens.get(1).getReading());
assertEquals("タベ", tokens.get(2).getReading());
assertEquals("タイ", tokens.get(3).getReading());
assertEquals("デス", tokens.get(4).getReading());
assertEquals("", tokens.get(5).getReading());
}
@Test
public void testReadings2() {
List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
assertEquals(9, tokens.size());
assertEquals("オオク", tokens.get(0).getReading());
assertEquals("", tokens.get(1).getReading());
assertEquals("ガクセイ", tokens.get(2).getReading());
assertEquals("", tokens.get(3).getReading());
assertEquals("シケン", tokens.get(4).getReading());
assertEquals("", tokens.get(5).getReading());
assertEquals("オチ", tokens.get(6).getReading());
assertEquals("", tokens.get(7).getReading());
assertEquals("", tokens.get(8).getReading());
}
@Test
public void testPronunciations() {
List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
assertEquals(6, tokens.size());
assertEquals("スシ", tokens.get(0).getPronunciation());
assertEquals("", tokens.get(1).getPronunciation());
assertEquals("タベ", tokens.get(2).getPronunciation());
assertEquals("タイ", tokens.get(3).getPronunciation());
assertEquals("デス", tokens.get(4).getPronunciation());
assertEquals("", tokens.get(5).getPronunciation());
}
@Test
public void testPronunciations2() {
List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
assertEquals(9, tokens.size());
// pronunciation differs from reading here
assertEquals("オーク", tokens.get(0).getPronunciation());
assertEquals("", tokens.get(1).getPronunciation());
assertEquals("ガクセイ", tokens.get(2).getPronunciation());
assertEquals("", tokens.get(3).getPronunciation());
assertEquals("シケン", tokens.get(4).getPronunciation());
assertEquals("", tokens.get(5).getPronunciation());
assertEquals("オチ", tokens.get(6).getPronunciation());
assertEquals("", tokens.get(7).getPronunciation());
assertEquals("", tokens.get(8).getPronunciation());
}
@Test
public void testBasicForms() {
List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
assertEquals(9, tokens.size());
assertNull(tokens.get(0).getBaseForm());
assertNull(tokens.get(1).getBaseForm());
assertNull(tokens.get(2).getBaseForm());
assertNull(tokens.get(3).getBaseForm());
assertNull(tokens.get(4).getBaseForm());
assertNull(tokens.get(5).getBaseForm());
assertEquals(tokens.get(6).getBaseForm(), "ある");
assertNull(tokens.get(7).getBaseForm());
assertNull(tokens.get(8).getBaseForm());
}
@Test
public void testInflectionTypes() {
List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
assertEquals(9, tokens.size());
assertNull(tokens.get(0).getInflectionType());
assertNull(tokens.get(1).getInflectionType());
assertNull(tokens.get(2).getInflectionType());
assertNull(tokens.get(3).getInflectionType());
assertNull(tokens.get(4).getInflectionType());
assertNull(tokens.get(5).getInflectionType());
assertEquals("五段・ラ行", tokens.get(6).getInflectionType());
assertEquals("特殊・マス", tokens.get(7).getInflectionType());
assertNull(tokens.get(8).getInflectionType());
}
@Test
public void testInflectionForms() {
List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
assertEquals(9, tokens.size());
assertNull(tokens.get(0).getInflectionForm());
assertNull(tokens.get(1).getInflectionForm());
assertNull(tokens.get(2).getInflectionForm());
assertNull(tokens.get(3).getInflectionForm());
assertNull(tokens.get(4).getInflectionForm());
assertNull(tokens.get(5).getInflectionForm());
assertEquals("連用形", tokens.get(6).getInflectionForm());
assertEquals("基本形", tokens.get(7).getInflectionForm());
assertNull(tokens.get(8).getInflectionForm());
}
@Test
public void testPartOfSpeech() {
List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
assertEquals(9, tokens.size());
assertEquals("名詞-代名詞-一般", tokens.get(0).getPartOfSpeech());
assertEquals("助詞-係助詞", tokens.get(1).getPartOfSpeech());
assertEquals("副詞-助詞類接続", tokens.get(2).getPartOfSpeech());
assertEquals("名詞-サ変接続", tokens.get(3).getPartOfSpeech());
assertEquals("名詞-一般", tokens.get(4).getPartOfSpeech());
assertEquals("助詞-格助詞-一般", tokens.get(5).getPartOfSpeech());
assertEquals("動詞-自立", tokens.get(6).getPartOfSpeech());
assertEquals("助動詞", tokens.get(7).getPartOfSpeech());
assertEquals("記号-句点", tokens.get(8).getPartOfSpeech());
}
// TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
// is there a way to actually look up the first and last word in the dictionary?
public void testYabottai() {
List<Token> tokens = segmenter.tokenize("やぼったい");
assertEquals(1, tokens.size());
assertEquals("やぼったい", tokens.get(0).getSurfaceFormString());
}
public void testTsukitosha() {
List<Token> tokens = segmenter.tokenize("突き通しゃ");
assertEquals(1, tokens.size());
assertEquals("突き通しゃ", tokens.get(0).getSurfaceFormString());
}
public void testBocchan() throws Exception {
doTestBocchan(1);
}
@Test @Nightly
public void testBocchanBig() throws Exception {
doTestBocchan(100);
}
private void doTestBocchan(int numIterations) throws Exception {
LineNumberReader reader = new LineNumberReader(new InputStreamReader(
this.getClass().getResourceAsStream("bocchan.utf-8")));
String line = reader.readLine();
reader.close();
if (VERBOSE) {
System.out.println("Test for Bocchan without pre-splitting sentences");
}
long totalStart = System.currentTimeMillis();
for (int i = 0; i < numIterations; i++){
segmenter.tokenize(line);
}
if (VERBOSE) {
System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
System.out.println("Test for Bocchan with pre-splitting sentences");
}
String[] sentences = line.split("、|。");
totalStart = System.currentTimeMillis();
for (int i = 0; i < numIterations; i++) {
for (String sentence: sentences) {
segmenter.tokenize(sentence);
}
}
if (VERBOSE) {
System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
}
}
}

View File

@ -25,18 +25,17 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
public class TestExtendedMode extends BaseTokenStreamTestCase {
private final Segmenter segmenter = new Segmenter(Mode.EXTENDED);
private final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.EXTENDED);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};

View File

@ -18,8 +18,11 @@ package org.apache.lucene.analysis.kuromoji;
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
@ -41,20 +44,103 @@ public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
new int[] { 1, 2, 2, 2 }
);
}
/**
* Test that search mode is enabled and working by default
*/
public void testDecomposition() throws IOException {
assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "シニアソフトウェアエンジニア",
new String[] { "シニア", "ソフトウェア", "エンジニア" }
);
final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, null, Mode.SEARCH,
KuromojiAnalyzer.getDefaultStopSet(),
KuromojiAnalyzer.getDefaultStopTags());
/*
//TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
TokenStream ts = a.tokenStream("foo", new StringReader("&#x250cdf66<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
ts.reset();
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
while(ts.incrementToken()) {
System.out.println(" " + termAtt.toString());
}
System.out.println("DONE PARSE\n\n");
*/
// Senior software engineer:
assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア",
new String[] { "シニア",
"シニアソフトウェアエンジニア",
"ソフトウェア",
"エンジニア" },
new int[] { 1, 0, 1, 1},
new int[] { 1, 3, 1, 1}
);
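// The compound token has position increment 0 (stacked on its first
// component) and position length 3 (it spans the three decompounded parts).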
// Kansai International Airport:
assertAnalyzesToPositions(a, "関西国際空港",
new String[] { "関西",
"関西国際空港", // zero pos inc
"国際",
"空港" },
new int[] {1, 0, 1, 1},
new int[] {1, 3, 1, 1}
);
// Konika Minolta Holdings; not quite the right
// segmentation (see LUCENE-3726):
assertAnalyzesToPositions(a, "コニカミノルタホールディングス",
new String[] { "コニカ",
"コニカミノルタホールディングス", // zero pos inc
"ミノルタ",
"ホールディングス"},
new int[] {1, 0, 1, 1},
new int[] {1, 3, 1, 1}
);
// Narita Airport
assertAnalyzesToPositions(a, "成田空港",
new String[] { "成田",
"成田空港",
"空港" },
new int[] {1, 0, 1},
new int[] {1, 2, 1}
);
// Kyoto University Baseball Club
assertAnalyzesToPositions(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "京都大学硬式野球部",
new String[] { "京都大",
"",
"硬式",
"野球",
"" },
new int[] {1, 1, 1, 1, 1},
new int[] {1, 1, 1, 1, 1});
// toDotFile(a, "成田空港", "/mnt/scratch/out.dot");
}
/**
* blast random strings against the analyzer
*/
public void testRandom() throws IOException {
checkRandomData(random, new KuromojiAnalyzer(TEST_VERSION_CURRENT), atLeast(10000));
final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, null, Mode.SEARCH,
KuromojiAnalyzer.getDefaultStopSet(),
KuromojiAnalyzer.getDefaultStopTags());
checkRandomData(random, a, atLeast(10000));
}
// Copied from TestKuromojiTokenizer, to make sure passing
// user dict to analyzer works:
public void testUserDict3() throws Exception {
// Test entry that breaks into multiple tokens:
final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, TestKuromojiTokenizer.readDict(),
Mode.SEARCH,
KuromojiAnalyzer.getDefaultStopSet(),
KuromojiAnalyzer.getDefaultStopTags());
assertTokenStreamContents(a.tokenStream("foo", new StringReader("abcd")),
new String[] { "a", "b", "cd" },
new int[] { 0, 1, 2 },
new int[] { 1, 2, 4 },
new Integer(4)
);
}
}

View File

@ -28,7 +28,7 @@ public class TestKuromojiBaseFormFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(reader);
Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.DEFAULT_MODE);
return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
}
};

View File

@ -17,7 +17,13 @@ package org.apache.lucene.analysis.kuromoji;
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
@ -25,21 +31,76 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
public static UserDictionary readDict() {
InputStream is = TestKuromojiTokenizer.class.getResourceAsStream("userdict.txt");
if (is == null) {
throw new RuntimeException("Cannot find userdict.txt in test classpath!");
}
try {
try {
Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
return new UserDictionary(reader);
} finally {
is.close();
}
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(reader);
Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
private Analyzer analyzerNormal = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.NORMAL);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
private Analyzer analyzerNoPunct = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
private Analyzer extendedModeAnalyzerNoPunct = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.EXTENDED);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
public void testNormalMode() throws Exception {
assertAnalyzesTo(analyzerNormal,
"シニアソフトウェアエンジニア",
new String[] {"シニアソフトウェアエンジニア"});
}
public void testDecomposition1() throws Exception {
assertAnalyzesTo(analyzer, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
assertAnalyzesTo(analyzerNoPunct, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
"アメリカ低所得者医療援助制度が、今日では、その予算の約3分の1を老人に費やしている。",
new String[] { "本来", "", "貧困", "", "", "女性", "", "子供", "", "医療", "保護", "",
"提供", "する", "ため", "", "創設", "", "", "", "制度", "", "ある", "アメリカ",
@ -55,7 +116,7 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
}
public void testDecomposition2() throws Exception {
assertAnalyzesTo(analyzer, "麻薬の密売は根こそぎ絶やさなければならない",
assertAnalyzesTo(analyzerNoPunct, "麻薬の密売は根こそぎ絶やさなければならない",
new String[] { "麻薬", "", "密売", "", "根こそぎ", "絶やさ", "なけれ", "", "なら", "ない" },
new int[] { 0, 2, 3, 5, 6, 10, 13, 16, 17, 19 },
new int[] { 2, 3, 5, 6, 10, 13, 16, 17, 19, 21 }
@ -63,7 +124,7 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
}
public void testDecomposition3() throws Exception {
assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。",
assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。",
new String[] { "魔女", "", "大将", "マシュー", "ホプキンス" },
new int[] { 0, 2, 3, 5, 10 },
new int[] { 2, 3, 5, 9, 15 }
@ -91,9 +152,32 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
ts.close();
}
/*
// NOTE: intentionally fails! Just trying to debug this
// one input...
public void testDecomposition6() throws Exception {
assertAnalyzesTo(analyzer, "奈良先端科学技術大学院大学",
new String[] { "これ", "", "", "", "", "ない" },
new int[] { 0, 2, 3, 4, 5, 6 },
new int[] { 2, 3, 4, 5, 6, 8 }
);
}
*/
/** Tests that sentence offset is incorporated into the resulting offsets */
public void testTwoSentences() throws Exception {
assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
/*
//TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
TokenStream ts = analyzer.tokenStream("foo", new StringReader("&#x250cdf66<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
ts.reset();
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
while(ts.incrementToken()) {
System.out.println(" " + termAtt.toString());
}
System.out.println("DONE PARSE\n\n");
*/
assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
new String[] { "魔女", "", "大将", "マシュー", "ホプキンス", "魔女", "", "大将", "マシュー", "ホプキンス" },
new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
@ -103,6 +187,7 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
checkRandomData(random, analyzerNoPunct, 10000*RANDOM_MULTIPLIER);
}
public void testLargeDocReliability() throws Exception {
@ -125,6 +210,9 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
public void testSurrogates2() throws IOException {
int numIterations = atLeast(10000);
for (int i = 0; i < numIterations; i++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + i);
}
String s = _TestUtil.randomUnicodeString(random, 100);
TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
@ -134,22 +222,410 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
}
}
}
public void testOnlyPunctuation() throws IOException {
TokenStream ts = analyzerNoPunct.tokenStream("foo", new StringReader("。、。。"));
ts.reset();
assertFalse(ts.incrementToken());
ts.end();
}
public void testOnlyPunctuationExtended() throws IOException {
TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", new StringReader("......"));
ts.reset();
assertFalse(ts.incrementToken());
ts.end();
}
// note: test is kinda silly since kuromoji emits punctuation tokens.
// but, when/if we filter these out it will be useful.
public void testEnd() throws Exception {
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない")),
assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない")),
new String[] { "これ", "", "", "", "", "ない" },
new int[] { 0, 2, 3, 4, 5, 6 },
new int[] { 2, 3, 4, 5, 6, 8 },
new Integer(8)
);
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない ")),
assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない ")),
new String[] { "これ", "", "", "", "", "ない" },
new int[] { 0, 2, 3, 4, 5, 6, 8 },
new int[] { 2, 3, 4, 5, 6, 8, 9 },
new Integer(12)
);
}
public void testUserDict() throws Exception {
// Not a great test because w/o userdict.txt the
// segmentation is the same:
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("関西国際空港に行った")),
new String[] { "関西", "国際", "空港", "", "行っ", "" },
new int[] { 0, 2, 4, 6, 7, 9 },
new int[] { 2, 4, 6, 7, 9, 10 },
new Integer(10)
);
}
public void testUserDict2() throws Exception {
// Better test: w/o userdict the segmentation is different:
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("朝青龍")),
new String[] { "朝青龍" },
new int[] { 0 },
new int[] { 3 },
new Integer(3)
);
}
public void testUserDict3() throws Exception {
// Test entry that breaks into multiple tokens:
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcd")),
new String[] { "a", "b", "cd" },
new int[] { 0, 1, 2 },
new int[] { 1, 2, 4 },
new Integer(4)
);
}
// HMM: fails (segments as a/b/cd/efghij)... because the
// two paths have exactly equal costs (1 KNOWN + 1
// UNKNOWN) and we don't seem to favor longer KNOWN /
// shorter UNKNOWN matches:
/*
public void testUserDict4() throws Exception {
// Test entry that has another entry as prefix
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcdefghij")),
new String[] { "ab", "cd", "efg", "hij" },
new int[] { 0, 2, 4, 7 },
new int[] { 2, 4, 7, 10 },
new Integer(10)
);
}
*/
public void testSegmentation() throws Exception {
// Skip tests for Michelle Kwan -- UniDic segments Kwan as ワン
// String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
// String[] surfaceForms = {
// "ミシェル", "", "クワン", "", "優勝", "", "まし", "", "",
// "スペース", "ステーション", "", "行き", "ます", "",
// "うたがわしい", ""
// };
String input = "スペースステーションに行きます。うたがわしい。";
String[] surfaceForms = {
"スペース", "ステーション", "", "行き", "ます", "",
"うたがわしい", ""
};
assertAnalyzesTo(analyzer,
input,
surfaceForms);
}
public void testLatticeToDot() throws Exception {
final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
KuromojiTokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
tokenizer.setGraphvizFormatter(gv2);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
String input = "スペースステーションに行きます。うたがわしい。";
String[] surfaceForms = {
"スペース", "ステーション", "", "行き", "ます", "",
"うたがわしい", ""
};
assertAnalyzesTo(analyzer,
input,
surfaceForms);
assertTrue(gv2.finish().indexOf("22.0") != -1);
}
private void assertReadings(String input, String... readings) throws IOException {
TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
ts.reset();
for(String reading : readings) {
assertTrue(ts.incrementToken());
assertEquals(reading, readingAtt.getReading());
}
assertFalse(ts.incrementToken());
ts.end();
}
private void assertPronunciations(String input, String... pronunciations) throws IOException {
TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
ts.reset();
for(String pronunciation : pronunciations) {
assertTrue(ts.incrementToken());
assertEquals(pronunciation, readingAtt.getPronunciation());
}
assertFalse(ts.incrementToken());
ts.end();
}
private void assertBaseForms(String input, String... baseForms) throws IOException {
TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
ts.reset();
for(String baseForm : baseForms) {
assertTrue(ts.incrementToken());
assertEquals(baseForm, baseFormAtt.getBaseForm());
}
assertFalse(ts.incrementToken());
ts.end();
}
private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
ts.reset();
for(String inflectionType : inflectionTypes) {
assertTrue(ts.incrementToken());
assertEquals(inflectionType, inflectionAtt.getInflectionType());
}
assertFalse(ts.incrementToken());
ts.end();
}
private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
ts.reset();
for(String inflectionForm : inflectionForms) {
assertTrue(ts.incrementToken());
assertEquals(inflectionForm, inflectionAtt.getInflectionForm());
}
assertFalse(ts.incrementToken());
ts.end();
}
private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
ts.reset();
for(String partOfSpeech : partsOfSpeech) {
assertTrue(ts.incrementToken());
assertEquals(partOfSpeech, partOfSpeechAtt.getPartOfSpeech());
}
assertFalse(ts.incrementToken());
ts.end();
}
public void testReadings() throws Exception {
assertReadings("寿司が食べたいです。",
"スシ",
"",
"タベ",
"タイ",
"デス",
"");
}
public void testReadings2() throws Exception {
assertReadings("多くの学生が試験に落ちた。",
"オオク",
"",
"ガクセイ",
"",
"シケン",
"",
"オチ",
"",
"");
}
public void testPronunciations() throws Exception {
assertPronunciations("寿司が食べたいです。",
"スシ",
"",
"タベ",
"タイ",
"デス",
"");
}
public void testPronunciations2() throws Exception {
// pronunciation differs from reading here
assertPronunciations("多くの学生が試験に落ちた。",
"オーク",
"",
"ガクセイ",
"",
"シケン",
"",
"オチ",
"",
"");
}
public void testBasicForms() throws Exception {
assertBaseForms("それはまだ実験段階にあります。",
null,
null,
null,
null,
null,
null,
"ある",
null,
null);
}
public void testInflectionTypes() throws Exception {
assertInflectionTypes("それはまだ実験段階にあります。",
null,
null,
null,
null,
null,
null,
"五段・ラ行",
"特殊・マス",
null);
}
public void testInflectionForms() throws Exception {
assertInflectionForms("それはまだ実験段階にあります。",
null,
null,
null,
null,
null,
null,
"連用形",
"基本形",
null);
}
public void testPartOfSpeech() throws Exception {
assertPartsOfSpeech("それはまだ実験段階にあります。",
"名詞-代名詞-一般",
"助詞-係助詞",
"副詞-助詞類接続",
"名詞-サ変接続",
"名詞-一般",
"助詞-格助詞-一般",
"動詞-自立",
"助動詞",
"記号-句点");
}
// TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
// is there a way to actually look up the first and last word in the dictionary?
public void testYabottai() throws Exception {
assertAnalyzesTo(analyzer, "やぼったい",
new String[] {"やぼったい"});
}
public void testTsukitosha() throws Exception {
assertAnalyzesTo(analyzer, "突き通しゃ",
new String[] {"突き通しゃ"});
}
public void testBocchan() throws Exception {
doTestBocchan(1);
}
@Nightly
public void testBocchanBig() throws Exception {
doTestBocchan(100);
}
/*
public void testWikipedia() throws Exception {
final FileInputStream fis = new FileInputStream("/q/lucene/jawiki-20120220-pages-articles.xml");
final Reader r = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
final long startTimeNS = System.nanoTime();
boolean done = false;
long compoundCount = 0;
long nonCompoundCount = 0;
long netOffset = 0;
while (!done) {
final TokenStream ts = analyzer.tokenStream("ignored", r);
ts.reset();
final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
int count = 0;
while (true) {
if (!ts.incrementToken()) {
done = true;
break;
}
count++;
if (posIncAtt.getPositionIncrement() == 0) {
compoundCount++;
} else {
nonCompoundCount++;
if (nonCompoundCount % 1000000 == 0) {
System.out.println(String.format("%.2f msec [pos=%d, %d, %d]",
(System.nanoTime()-startTimeNS)/1000000.0,
netOffset + offsetAtt.startOffset(),
nonCompoundCount,
compoundCount));
}
}
if (count == 100000000) {
System.out.println(" again...");
break;
}
}
ts.end();
netOffset += offsetAtt.endOffset();
}
System.out.println("compoundCount=" + compoundCount + " nonCompoundCount=" + nonCompoundCount);
r.close();
}
*/
private void doTestBocchan(int numIterations) throws Exception {
LineNumberReader reader = new LineNumberReader(new InputStreamReader(
this.getClass().getResourceAsStream("bocchan.utf-8")));
String line = reader.readLine();
reader.close();
if (VERBOSE) {
System.out.println("Test for Bocchan without pre-splitting sentences");
}
/*
if (numIterations > 1) {
// warmup
for (int i = 0; i < numIterations; i++) {
final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
ts.reset();
while(ts.incrementToken());
}
}
*/
long totalStart = System.currentTimeMillis();
for (int i = 0; i < numIterations; i++) {
final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
ts.reset();
while(ts.incrementToken());
}
String[] sentences = line.split("、|。");
if (VERBOSE) {
System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
System.out.println("Test for Bocchan with pre-splitting sentences (" + sentences.length + " sentences)");
}
totalStart = System.currentTimeMillis();
for (int i = 0; i < numIterations; i++) {
for (String sentence: sentences) {
final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(sentence));
ts.reset();
while(ts.incrementToken());
}
}
if (VERBOSE) {
System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
}
}
}

View File

@ -27,20 +27,19 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.util.IOUtils;
public class TestSearchMode extends BaseTokenStreamTestCase {
private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
private final Segmenter segmenter = new Segmenter(Mode.SEARCH);
private final Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.SEARCH);
return new TokenStreamComponents(tokenizer, tokenizer);
}
};
/** Test search mode segmentation */
public void testSearchSegmentation() throws IOException {
InputStream is = TestSearchMode.class.getResourceAsStream(SEGMENTATION_FILENAME);
@ -63,7 +62,18 @@ public class TestSearchMode extends BaseTokenStreamTestCase {
String[] fields = line.split("\t", 2);
String sourceText = fields[0];
String[] expectedTokens = fields[1].split("\\s+");
assertAnalyzesTo(analyzer, sourceText, expectedTokens);
int[] expectedPosIncrs = new int[expectedTokens.length];
int[] expectedPosLengths = new int[expectedTokens.length];
for(int tokIDX=0;tokIDX<expectedTokens.length;tokIDX++) {
if (expectedTokens[tokIDX].endsWith("/0")) {
expectedTokens[tokIDX] = expectedTokens[tokIDX].replace("/0", "");
expectedPosLengths[tokIDX] = expectedTokens.length-1;
} else {
expectedPosIncrs[tokIDX] = 1;
expectedPosLengths[tokIDX] = 1;
}
}
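// e.g. the data line "成田空港 成田 成田空港/0 空港" expects tokens
// {成田, 成田空港, 空港} with position increments {1, 0, 1}: the "/0"
// suffix marks a compound stacked on its first part.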
assertAnalyzesTo(analyzer, sourceText, expectedTokens, expectedPosIncrs);
}
} finally {
is.close();

View File

@ -23,29 +23,17 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.io.IOException;
import org.apache.lucene.analysis.kuromoji.SegmenterTest;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.kuromoji.TestKuromojiTokenizer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class UserDictionaryTest extends LuceneTestCase {
private UserDictionary readDict() throws IOException {
InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
if (is == null)
throw new FileNotFoundException("Cannot find userdict.txt in test classpath!");
try {
Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
return new UserDictionary(reader);
} finally {
is.close();
}
}
@Test
public void testLookup() throws IOException {
UserDictionary dictionary = readDict();
UserDictionary dictionary = TestKuromojiTokenizer.readDict();
String s = "関西国際空港に行った";
int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
// Length should be three 関西, 国際, 空港
@ -69,7 +57,7 @@ public class UserDictionaryTest extends LuceneTestCase {
@Test
public void testReadings() throws IOException {
UserDictionary dictionary = readDict();
UserDictionary dictionary = TestKuromojiTokenizer.readDict();
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
@ -83,7 +71,7 @@ public class UserDictionaryTest extends LuceneTestCase {
@Test
public void testPartOfSpeech() throws IOException {
UserDictionary dictionary = readDict();
UserDictionary dictionary = TestKuromojiTokenizer.readDict();
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞
@ -92,7 +80,7 @@ public class UserDictionaryTest extends LuceneTestCase {
@Test
public void testRead() throws IOException {
UserDictionary dictionary = readDict();
UserDictionary dictionary = TestKuromojiTokenizer.readDict();
assertNotNull(dictionary);
}
}

View File

@ -25,43 +25,45 @@
##
# Kansai International Airport
関西国際空港 関西 国際 空港
関西国際空港 関西 関西国際空港/0 国際 空港
# Narita Airport
成田空港 成田 空港
成田空港 成田 成田空港/0 空港
# Haneda Airport
羽田空港 羽田 空港
羽田空港 羽田 羽田空港/0 空港
# Nara Institute of Science and Technology
奈良先端科学技術大学院大学 奈良 先端 科学 技術 大学院 大学
奈良先端科学技術大学院大学 奈良 奈良先端科学技術大学院大学/0 先端 科学 技術 大学院 大学
# Tokyo University
東京大学 東京 大学
東京大学 東京 東京大学/0 大学
# Kyoto University
京都大学 京都 大学
京都大学 京都 京都大学/0 大学
# NOTE: differs from non-compound mode:
# Kyoto University Baseball Club
京都大学硬式野球部 京都 大学 硬式 野球 部
京都大学硬式野球部 京都大 学 硬式 野球 部
##
## Katakana titles
##
# Senior Software Engineer
シニアソフトウェアエンジニア シニア ソフトウェア エンジニア
シニアソフトウェアエンジニア シニア シニアソフトウェアエンジニア/0 ソフトウェア エンジニア
# Software Engineer
ソフトウェアエンジニア ソフトウェア エンジニア
# Senior Project Manager
シニアプロジェクトマネジャー シニア プロジェクト マネジャー
シニアプロジェクトマネジャー シニア シニアプロジェクトマネジャー/0 プロジェクト マネジャー
# Project Manager
プロジェクトマネジャー プロジェクト マネジャー
# Senior Sales Engineer
シニアセールスエンジニア シニア セールス エンジニア
シニアセールスエンジニア シニア シニアセールスエンジニア/0 セールス エンジニア
# System Architect
システムアーキテクト システム アーキテクト
システムアーキテクト システム システムアーキテクト/0 アーキテクト
# Senior System Architect
シニアシステムアーキテクト シニア システム アーキテクト
シニアシステムアーキテクト シニア シニアシステムアーキテクト/0 システム アーキテクト
# System Administrator
システムアドミニストレータ システム アドミニストレータ
システムアドミニストレーター システム アドミニストレーター
システムアドミニストレーター システム システムアドミニストレーター/0 アドミニストレーター
# Senior System Administrator
シニアシステムアドミニストレーター シニア システム アドミニストレーター
シニアシステムアドミニストレーター シニア シニアシステムアドミニストレーター/0 システム アドミニストレーター
##
## Company names (several are fictitious)
@ -70,25 +72,25 @@
# SoftBank Mobile
ソフトバンクモバイル ソフトバンク モバイル
# Alpine Materials
アルパインマテリアルズ アルパイン マテリアルズ
アルパインマテリアルズ アルパイン アルパインマテリアルズ/0 マテリアルズ
# Sapporo Holdings
サッポロホールディングス サッポロ ホールディングス
# Yamada Corporation
ヤマダコーポレーション ヤマダ コーポレーション
ヤマダコーポレーション ヤマダ ヤマダコーポレーション/0 コーポレーション
# Canon Semiconductor Equipment NOTE: Semiconductor becomes semi + conductor
キヤノンセミコンダクターエクィップメント キヤノン セミ コンダクター エクィップメント
キヤノンセミコンダクターエクィップメント キヤノン キヤノンセミコンダクターエクィップメント/0 セミ コンダクター エクィップメント
# Oriental Chain
オリエンタルチエン オリエンタル チエン
オリエンタルチエン オリエンタル オリエンタルチエン/0 チエン
# Ally Projects Japan NOTE: Becomes one token as プロジェクツ is not in IPADIC
アーリープロジェクツジャパン アーリープロジェクツジャパン
# Peter Pan Corporation
ピーターパンコーポレーション ピーター パン コーポレーション
ピーターパンコーポレーション ピーター ピーターパンコーポレーション/0 パン コーポレーション
# AIM Create
エイムクリエイツ エイムクリエイツ
# Mars Engineering
マースエンジニアリング マース エンジニアリング
マースエンジニアリング マース マースエンジニアリング/0 エンジニアリング
# Fuji Protein Technology
フジプロテインテクノロジー フジ プロテイン テクノロジー
フジプロテインテクノロジー フジ フジプロテインテクノロジー/0 プロテイン テクノロジー
##
## Person names
@ -100,7 +102,7 @@
スティーブジョブズ スティーブ ジョブズ
# Harry Potter NOTE: Becomes one token (short word)
ハリーポッター ハリーポッター
# Bill Gates NOTE: Becomes one token (short work)
# Bill Gates NOTE: Becomes one token (short word)
ビルゲイツ ビルゲイツ
# Sean Connery NOTE: Becomes one token (okay)
ショーンコネリー ショーンコネリー
@ -133,8 +135,8 @@
##
# JT Engineering NOTE: Becomes J Tien ginia ring (substrings are in IPADIC)
ジェイティエンジニアリング ジェイ ティエン ジニア リング
ジェイティエンジニアリング ジェイ ジェイティエンジニアリング/0 ティエン ジニア リング
# Anchovy pasta NOTE: Becomes Anch + yvipasta
アンチョビパスタ アンチ ョビパスタ
アンチョビパスタ アンチ アンチョビパスタ/0 ョビパスタ
# Surprise gift NOTE: Becomes one token (surprise not in IPADIC)
サプライズギフト サプライズギフト

View File

@ -4,3 +4,7 @@
# Custom reading for sumo wrestler
朝青龍,朝青龍,アサショウリュウ,カスタム人名
# Silly entry:
abcd,a b cd,foo1 foo2 foo3,bar
abcdefg,ab cd efg,foo1 foo2 foo4,bar
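# (Each entry is: surface form, space-separated segmentation,
# space-separated readings, part-of-speech name.)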

View File

@ -28,8 +28,7 @@ import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.Segmenter;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.BaseTokenizerFactory;
@ -88,7 +87,7 @@ public class KuromojiTokenizerFactory extends BaseTokenizerFactory implements Re
@Override
public Tokenizer create(Reader input) {
return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
return new KuromojiTokenizer(input, userDictionary, true, mode);
}
private Mode getMode(Map<String, String> args) {
@ -96,7 +95,7 @@ public class KuromojiTokenizerFactory extends BaseTokenizerFactory implements Re
if (mode != null) {
return Mode.valueOf(mode.toUpperCase(Locale.ENGLISH));
} else {
return Segmenter.DEFAULT_MODE;
return KuromojiTokenizer.DEFAULT_MODE;
}
}
}

View File

@ -50,7 +50,7 @@ public class TestKuromojiTokenizerFactory extends BaseTokenTestCase {
factory.inform(new SolrResourceLoader(null, null));
TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア"));
assertTokenStreamContents(ts,
new String[] { "ニア", "ソフトウェア", "エンジニア" }
new String[] { "ニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" }
);
}