mirror of https://github.com/apache/lucene.git
Kuromoji now produces both compound words and the segmentation of those words in search mode (LUCENE-3767)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1296805 13f79535-47bb-0310-9956-ffa450edef68
parent ff0650ffa4
commit 430365f7cc
@@ -154,6 +154,9 @@ New Features
* LUCENE-3730: Refine Kuromoji search mode (Mode.SEARCH) decompounding
  heuristics. (Christian Moen via Robert Muir)

+* LUCENE-3767: Kuromoji tokenizer/analyzer produces both compound words
+  and the segmentation of that compound in Mode.SEARCH. (Robert Muir, Mike McCandless via Christian Moen)
+
* LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous
  BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do
  joins in both parent to child and child to parent directions.
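In practice, search mode now emits a small token graph: the compound token spans the positions of its parts via the new PositionLengthAttribute. A hedged sketch using the assertAnalyzesToPositions helper added later in this commit — the token order and values below are illustrative, not copied from this diff:

// Illustrative only: a compound plus its segments, linked by posInc/posLength.
assertAnalyzesToPositions(analyzer, "関西国際空港",
    new String[] {"関西", "関西国際空港", "国際", "空港"},
    new int[]    {1, 0, 1, 1},    // position increments
    new int[]    {1, 3, 1, 1});   // position lengths: compound spans 3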
@@ -52,9 +52,10 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
   * @param positionIncrement the distance from the prior term
   */
  public void setPositionIncrement(int positionIncrement) {
-    if (positionIncrement < 0)
+    if (positionIncrement < 0) {
      throw new IllegalArgumentException
-        ("Increment must be zero or greater: " + positionIncrement);
+        ("Increment must be zero or greater: got " + positionIncrement);
+    }
    this.positionIncrement = positionIncrement;
  }

@@ -77,7 +78,8 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
    }

    if (other instanceof PositionIncrementAttributeImpl) {
-      return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
+      PositionIncrementAttributeImpl _other = (PositionIncrementAttributeImpl) other;
+      return positionIncrement == _other.positionIncrement;
    }

    return false;
@@ -93,5 +95,4 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
    PositionIncrementAttribute t = (PositionIncrementAttribute) target;
    t.setPositionIncrement(positionIncrement);
  }
-
}
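A quick sketch of the tightened validation above (the token stream ts is assumed):

PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
posIncAtt.setPositionIncrement(0);   // OK: stacked token, e.g. a synonym
posIncAtt.setPositionIncrement(-1);  // throws IllegalArgumentException:
                                     // "Increment must be zero or greater: got -1"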
@@ -0,0 +1,41 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.Attribute;

/** The positionLength determines how many positions this
 *  token spans. Very few analyzer components actually
 *  produce this attribute, and indexing ignores it, but
 *  it's useful to express the graph structure naturally
 *  produced by decompounding, word splitting/joining,
 *  synonym filtering, etc.
 *
 *  <p>The default value is one. */

public interface PositionLengthAttribute extends Attribute {
  /** @param positionLength how many positions this token
   *  spans. */
  public void setPositionLength(int positionLength);

  /** Returns the position length of this Token.
   * @see #setPositionLength
   */
  public int getPositionLength();
}
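Together with the position increment, this attribute lets a stream encode a lattice instead of a flat chain. A minimal sketch, assuming a hand-built stream for "wireless lan" plus the single-token synonym "wifi" (hypothetical filter internals, not from this commit):

PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);

// "wireless": posInc=1, posLength=1
// "wifi":     posInc=0, posLength=2  (stacks on "wireless", spans both positions)
// "lan":      posInc=1, posLength=1
posIncAtt.setPositionIncrement(0);
posLenAtt.setPositionLength(2);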
@@ -0,0 +1,74 @@
package org.apache.lucene.analysis.tokenattributes;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeImpl;

/** See {@link PositionLengthAttribute}. */
public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable {
  private int positionLength = 1;

  /** @param positionLength how many positions this token
   *  spans. NOTE: this is optional, and most analyzers
   *  don't change the default value (1). */
  public void setPositionLength(int positionLength) {
    if (positionLength < 1) {
      throw new IllegalArgumentException
        ("Position length must be 1 or greater: got " + positionLength);
    }
    this.positionLength = positionLength;
  }

  /** Returns the position length of this Token.
   * @see #setPositionLength
   */
  public int getPositionLength() {
    return positionLength;
  }

  @Override
  public void clear() {
    this.positionLength = 1;
  }

  @Override
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    if (other instanceof PositionLengthAttributeImpl) {
      PositionLengthAttributeImpl _other = (PositionLengthAttributeImpl) other;
      return positionLength == _other.positionLength;
    }

    return false;
  }

  @Override
  public int hashCode() {
    return positionLength;
  }

  @Override
  public void copyTo(AttributeImpl target) {
    PositionLengthAttribute t = (PositionLengthAttribute) target;
    t.setPositionLength(positionLength);
  }
}
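Note the clear() contract above: the value resets to 1 between tokens, so only graph-producing components ever need to touch it. A small sketch:

PositionLengthAttributeImpl att = new PositionLengthAttributeImpl();
att.setPositionLength(3);             // e.g. a compound spanning three segments
att.clear();                          // invoked per token via clearAttributes()
assert att.getPositionLength() == 1;  // back to the default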
@@ -0,0 +1,148 @@
package org.apache.lucene.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

/** Acts like a forever growing char[] as you read
 *  characters into it from the provided reader, but
 *  internally it uses a circular buffer to only hold the
 *  characters that haven't been freed yet. This is like a
 *  PushbackReader, except you don't have to specify
 *  up-front the max size of the buffer, but you do have to
 *  periodically call {@link #freeBefore}. */

public final class RollingCharBuffer {

  private Reader reader;

  private char[] buffer = new char[32];

  // Next array index to write to in buffer:
  private int nextWrite;

  // Next absolute position to read from reader:
  private int nextPos;

  // How many valid chars (wrapped) are in the buffer:
  private int count;

  // True if we hit EOF
  private boolean end;

  /** Clear array and switch to new reader. */
  public void reset(Reader reader) {
    this.reader = reader;
    nextPos = 0;
    nextWrite = 0;
    count = 0;
    end = false;
  }

  /* Absolute position read. NOTE: pos must not jump
   * ahead by more than 1! Ie, it's OK to read arbitrarily
   * far back (just not prior to the last {@link
   * #freeBefore}), but NOT ok to read arbitrarily far
   * ahead. Returns -1 if you hit EOF. */
  public int get(int pos) throws IOException {
    //System.out.println("  get pos=" + pos + " nextPos=" + nextPos + " count=" + count);
    if (pos == nextPos) {
      if (end) {
        return -1;
      }
      final int ch = reader.read();
      if (ch == -1) {
        end = true;
        return -1;
      }
      if (count == buffer.length) {
        // Grow
        final char[] newBuffer = new char[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_CHAR)];
        //System.out.println(Thread.currentThread().getName() + ": cb grow " + newBuffer.length);
        System.arraycopy(buffer, nextWrite, newBuffer, 0, buffer.length - nextWrite);
        System.arraycopy(buffer, 0, newBuffer, buffer.length - nextWrite, nextWrite);
        nextWrite = buffer.length;
        buffer = newBuffer;
      }
      if (nextWrite == buffer.length) {
        nextWrite = 0;
      }
      buffer[nextWrite++] = (char) ch;
      count++;
      nextPos++;
      return ch;
    } else {
      // Cannot read from future (except by 1):
      assert pos < nextPos;

      // Cannot read from already freed past:
      assert nextPos - pos <= count;

      final int index = getIndex(pos);
      return buffer[index];
    }
  }

  // For assert:
  private boolean inBounds(int pos) {
    return pos >= 0 && pos < nextPos && pos >= nextPos - count;
  }

  private int getIndex(int pos) {
    int index = nextWrite - (nextPos - pos);
    if (index < 0) {
      // Wrap:
      index += buffer.length;
      assert index >= 0;
    }
    return index;
  }

  public char[] get(int posStart, int length) {
    assert length > 0;
    assert inBounds(posStart): "posStart=" + posStart + " length=" + length;
    //System.out.println("  buffer.get posStart=" + posStart + " len=" + length);

    final int startIndex = getIndex(posStart);
    final int endIndex = getIndex(posStart + length);
    //System.out.println("    startIndex=" + startIndex + " endIndex=" + endIndex);

    final char[] result = new char[length];
    if (endIndex >= startIndex && length < buffer.length) {
      System.arraycopy(buffer, startIndex, result, 0, endIndex-startIndex);
    } else {
      // Wrapped:
      final int part1 = buffer.length-startIndex;
      System.arraycopy(buffer, startIndex, result, 0, part1);
      System.arraycopy(buffer, 0, result, buffer.length-startIndex, length-part1);
    }
    return result;
  }

  /** Call this to notify us that no chars before this
   *  absolute position are needed anymore. */
  public void freeBefore(int pos) {
    assert pos >= 0;
    assert pos <= nextPos;
    final int newCount = nextPos - pos;
    assert newCount <= count: "newCount=" + newCount + " count=" + count;
    assert newCount <= buffer.length: "newCount=" + newCount + " buf.length=" + buffer.length;
    count = newCount;
  }
}
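A brief usage sketch of the class above (reader contents assumed):

RollingCharBuffer buf = new RollingCharBuffer();
buf.reset(new StringReader("hello"));
int c = buf.get(0);            // reads 'h' from the reader
c = buf.get(1);                // reads 'e'
char[] slice = buf.get(0, 2);  // {'h', 'e'} served from the circular buffer
buf.freeBefore(1);             // position 0 may no longer be read
c = buf.get(2);                // OK: get() may only advance one past the end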
@@ -840,6 +840,7 @@ public final class FST<T> {
  }

  public Arc<T> readFirstRealTargetArc(int node, Arc<T> arc, final BytesReader in) throws IOException {
+    assert in.bytes == bytes;
    final int address = getNodeAddress(node);
    in.pos = address;
    //System.out.println("  readFirstRealTargtArc address="
@@ -936,6 +937,7 @@ public final class FST<T> {
  /** Never returns null, but you should never call this if
   *  arc.isLast() is true. */
  public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
+    assert in.bytes == bytes;

    // TODO: can't assert this because we call from readFirstArc
    // assert !flag(arc.flags, BIT_LAST_ARC);
@@ -1019,6 +1021,7 @@ public final class FST<T> {
   * This returns null if the arc was not found, else the incoming arc. */
  public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException {
    assert cachedRootArcs != null;
+    assert in.bytes == bytes;

    if (labelToMatch == END_LABEL) {
      if (follow.isFinal()) {
@@ -1225,17 +1228,20 @@ public final class FST<T> {

  /** Expert */
  public static abstract class BytesReader extends DataInput {
-    int pos;
+    protected int pos;
+    protected final byte[] bytes;
+    protected BytesReader(byte[] bytes, int pos) {
+      this.bytes = bytes;
+      this.pos = pos;
+    }
    abstract void skip(int byteCount);
    abstract void skip(int base, int byteCount);
  }

  final static class ReverseBytesReader extends BytesReader {
-    final byte[] bytes;

    public ReverseBytesReader(byte[] bytes, int pos) {
-      this.bytes = bytes;
-      this.pos = pos;
+      super(bytes, pos);
    }

    @Override
@@ -1262,11 +1268,9 @@ public final class FST<T> {
  // TODO: can we use just ByteArrayDataInput...? need to
  // add a .skipBytes to DataInput.. hmm and .setPosition
  final static class ForwardBytesReader extends BytesReader {
-    final byte[] bytes;

    public ForwardBytesReader(byte[] bytes, int pos) {
-      this.bytes = bytes;
-      this.pos = pos;
+      super(bytes, pos);
    }

    @Override
@@ -29,6 +29,8 @@ public class TestSimpleAttributeImpl extends LuceneTestCase {
  public void testAttributes() {
    _TestUtil.assertAttributeReflection(new PositionIncrementAttributeImpl(),
      Collections.singletonMap(PositionIncrementAttribute.class.getName()+"#positionIncrement", 1));
+    _TestUtil.assertAttributeReflection(new PositionLengthAttributeImpl(),
+      Collections.singletonMap(PositionLengthAttribute.class.getName()+"#positionLength", 1));
    _TestUtil.assertAttributeReflection(new FlagsAttributeImpl(),
      Collections.singletonMap(FlagsAttribute.class.getName()+"#flags", 0));
    _TestUtil.assertAttributeReflection(new TypeAttributeImpl(),
@@ -0,0 +1,94 @@
package org.apache.lucene.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.StringReader;

public class TestRollingCharBuffer extends LuceneTestCase {

  public void test() throws Exception {
    final int ITERS = atLeast(1000);

    RollingCharBuffer buffer = new RollingCharBuffer();

    for(int iter=0;iter<ITERS;iter++) {
      final int stringLen = random.nextBoolean() ? random.nextInt(50) : random.nextInt(20000);
      final String s;
      if (stringLen == 0) {
        s = "";
      } else {
        s = _TestUtil.randomUnicodeString(random, stringLen);
      }
      if (VERBOSE) {
        System.out.println("\nTEST: iter=" + iter + " s.length()=" + s.length());
      }
      buffer.reset(new StringReader(s));
      int nextRead = 0;
      int availCount = 0;
      while(nextRead < s.length()) {
        if (VERBOSE) {
          System.out.println("  cycle nextRead=" + nextRead + " avail=" + availCount);
        }
        if (availCount == 0 || random.nextBoolean()) {
          // Read next char
          if (VERBOSE) {
            System.out.println("    new char");
          }
          assertEquals(s.charAt(nextRead), buffer.get(nextRead));
          nextRead++;
          availCount++;
        } else if (random.nextBoolean()) {
          // Read previous char
          int pos = _TestUtil.nextInt(random, nextRead-availCount, nextRead-1);
          if (VERBOSE) {
            System.out.println("    old char pos=" + pos);
          }
          assertEquals(s.charAt(pos), buffer.get(pos));
        } else {
          // Read slice
          int length;
          if (availCount == 1) {
            length = 1;
          } else {
            length = _TestUtil.nextInt(random, 1, availCount);
          }
          int start;
          if (length == availCount) {
            start = nextRead - availCount;
          } else {
            start = nextRead - availCount + random.nextInt(availCount-length);
          }
          if (VERBOSE) {
            System.out.println("    slice start=" + start + " length=" + length);
          }
          assertEquals(s.substring(start, start+length),
                       new String(buffer.get(start, length)));
        }

        if (availCount > 0 && random.nextInt(20) == 17) {
          final int toFree = random.nextInt(availCount);
          if (VERBOSE) {
            System.out.println("    free " + toFree + " (avail=" + (availCount-toFree) + ")");
          }
          buffer.freeBefore(nextRead-(availCount-toFree));
          availCount -= toFree;
        }
      }
    }
  }
}
@@ -17,13 +17,18 @@ package org.apache.lucene.analysis;
 * limitations under the License.
 */

+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
-import java.io.IOException;
+import java.io.StringWriter;
+import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
@@ -83,7 +88,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
    }
  }

-  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);

@@ -107,6 +112,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
      posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }
+
+    PositionLengthAttribute posLengthAtt = null;
+    if (posLengths != null) {
+      assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
+      posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
+    }

    ts.reset();
    for (int i = 0; i < output.length; i++) {
@@ -116,6 +127,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      if (offsetAtt != null) offsetAtt.setOffset(14584724,24683243);
      if (typeAtt != null) typeAtt.setType("bogusType");
      if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
+      if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);

      checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
      assertTrue("token "+i+" does not exist", ts.incrementToken());
@@ -130,6 +142,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
        assertEquals("type "+i, types[i], typeAtt.type());
      if (posIncrements != null)
        assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
+      if (posLengths != null)
+        assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());

      // we can enforce some basic things about a few attributes even if the caller doesn't check:
      if (offsetAtt != null) {
@@ -138,14 +152,18 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
        assertTrue("endOffset must be >= startOffset", offsetAtt.endOffset() >= offsetAtt.startOffset());
        if (finalOffset != null) {
          assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
-          assertTrue("endOffset must be <= finalOffset", offsetAtt.endOffset() <= finalOffset.intValue());
+          assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
+                     offsetAtt.endOffset() <= finalOffset.intValue());
        }
      }
      if (posIncrAtt != null) {
        assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
      }
+      if (posLengthAtt != null) {
+        assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
+      }
    }
-    assertFalse("end of stream", ts.incrementToken());
+    assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
    ts.end();
    if (finalOffset != null)
      assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@@ -155,65 +173,81 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
    ts.close();
  }

+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
+  }
+
  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, null, null, null);
+    assertTokenStreamContents(ts, output, null, null, null, null, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, types, null, null);
+    assertTokenStreamContents(ts, output, null, null, types, null, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
-    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null);
+    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], Integer finalOffset) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, finalOffset);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
  }

  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, Integer finalOffset) throws IOException {
-    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, finalOffset);
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
  }

+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
+  }
+
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
  }

+  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
+  }
+
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, null, null);
+    assertAnalyzesTo(a, input, output, null, null, null, null, null);
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, types, null);
+    assertAnalyzesTo(a, input, output, null, null, types, null, null);
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
-    assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
+    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
  }

+  public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
+  }
+
  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
-    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
  }

  public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], int[] posIncrements) throws IOException {
-    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements);
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
  }


  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
-    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, input.length());
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
  }

  public static void assertAnalyzesToReuse(Analyzer a, String input, String[] output) throws IOException {
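A hedged example call of the new position-length-aware helper (the analyzer and its output are assumed, not taken from this commit's tests):

// Offsets and types are skipped (null) by assertAnalyzesToPositions:
assertAnalyzesToPositions(analyzer, "abcd",
    new String[] {"abcd", "ab", "cd"},  // assumed decompounder output
    new int[]    {1, 0, 1},             // position increments
    new int[]    {2, 1, 1});            // position lengths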
@@ -326,7 +360,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
    }

    if (VERBOSE) {
-      System.out.println("NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
+      System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);
    }

    int remainder = random.nextInt(10);
@@ -336,10 +370,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.hasAttribute(OffsetAttribute.class) ? ts.getAttribute(OffsetAttribute.class) : null;
    PositionIncrementAttribute posIncAtt = ts.hasAttribute(PositionIncrementAttribute.class) ? ts.getAttribute(PositionIncrementAttribute.class) : null;
+    PositionLengthAttribute posLengthAtt = ts.hasAttribute(PositionLengthAttribute.class) ? ts.getAttribute(PositionLengthAttribute.class) : null;
    TypeAttribute typeAtt = ts.hasAttribute(TypeAttribute.class) ? ts.getAttribute(TypeAttribute.class) : null;
    List<String> tokens = new ArrayList<String>();
    List<String> types = new ArrayList<String>();
    List<Integer> positions = new ArrayList<Integer>();
+    List<Integer> positionLengths = new ArrayList<Integer>();
    List<Integer> startOffsets = new ArrayList<Integer>();
    List<Integer> endOffsets = new ArrayList<Integer>();
    ts.reset();
@@ -347,6 +383,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      tokens.add(termAtt.toString());
      if (typeAtt != null) types.add(typeAtt.type());
      if (posIncAtt != null) positions.add(posIncAtt.getPositionIncrement());
+      if (posLengthAtt != null) positionLengths.add(posLengthAtt.getPositionLength());
      if (offsetAtt != null) {
        startOffsets.add(offsetAtt.startOffset());
        endOffsets.add(offsetAtt.endOffset());
@@ -357,11 +394,21 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
    // verify reusing is "reproducable" and also get the normal tokenstream sanity checks
    if (!tokens.isEmpty()) {
      if (VERBOSE) {
-        System.out.println("NOTE: BaseTokenStreamTestCase: re-run analysis");
+        System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
      }
      reader = new StringReader(text);
      ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
-      if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
+      if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+        // offset + pos + posLength + type
+        assertTokenStreamContents(ts,
+          tokens.toArray(new String[tokens.size()]),
+          toIntArray(startOffsets),
+          toIntArray(endOffsets),
+          types.toArray(new String[types.size()]),
+          toIntArray(positions),
+          toIntArray(positionLengths),
+          text.length());
+      } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
        // offset + pos + type
        assertTokenStreamContents(ts,
          tokens.toArray(new String[tokens.size()]),
@@ -369,7 +416,18 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
          toIntArray(endOffsets),
          types.toArray(new String[types.size()]),
          toIntArray(positions),
+          null,
          text.length());
+      } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
+        // offset + pos + posLength
+        assertTokenStreamContents(ts,
+          tokens.toArray(new String[tokens.size()]),
+          toIntArray(startOffsets),
+          toIntArray(endOffsets),
+          null,
+          toIntArray(positions),
+          toIntArray(positionLengths),
+          text.length());
      } else if (posIncAtt != null && offsetAtt != null) {
        // offset + pos
        assertTokenStreamContents(ts,
@@ -378,6 +436,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
          toIntArray(endOffsets),
          null,
          toIntArray(positions),
+          null,
          text.length());
      } else if (offsetAtt != null) {
        // offset
@@ -387,6 +446,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
          toIntArray(endOffsets),
          null,
          null,
+          null,
          text.length());
      } else {
        // terms only
@@ -396,6 +456,22 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      }
    }
  }

+  protected String toDot(Analyzer a, String inputText) throws IOException {
+    final StringWriter sw = new StringWriter();
+    final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
+    ts.reset();
+    new TokenStreamToDot(inputText, ts, new PrintWriter(sw)).toDot();
+    return sw.toString();
+  }
+
+  protected void toDotFile(Analyzer a, String inputText, String localFileName) throws IOException {
+    Writer w = new OutputStreamWriter(new FileOutputStream(localFileName), "UTF-8");
+    final TokenStream ts = a.tokenStream("field", new StringReader(inputText));
+    ts.reset();
+    new TokenStreamToDot(inputText, ts, new PrintWriter(w)).toDot();
+    w.close();
+  }

  static int[] toIntArray(List<Integer> list) {
    int ret[] = new int[list.size()];
@@ -0,0 +1,159 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.PrintWriter;
import java.io.IOException;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

/** Consumes a TokenStream and outputs the dot (graphviz) string (graph). */
public class TokenStreamToDot {

  private final TokenStream in;
  private final CharTermAttribute termAtt;
  private final PositionIncrementAttribute posIncAtt;
  private final PositionLengthAttribute posLengthAtt;
  private final OffsetAttribute offsetAtt;
  private final String inputText;
  protected final PrintWriter out;

  /** If inputText is non-null, and the TokenStream has
   *  offsets, we include the surface form in each arc's
   *  label. */
  public TokenStreamToDot(String inputText, TokenStream in, PrintWriter out) {
    this.in = in;
    this.out = out;
    this.inputText = inputText;
    termAtt = in.addAttribute(CharTermAttribute.class);
    posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    if (in.hasAttribute(OffsetAttribute.class)) {
      offsetAtt = in.addAttribute(OffsetAttribute.class);
    } else {
      offsetAtt = null;
    }
  }

  public void toDot() throws IOException {
    in.reset();
    writeHeader();

    // TODO: is there some way to tell dot that it should
    // make the "main path" a straight line and have the
    // non-sausage arcs not affect node placement...

    int pos = -1;
    int lastEndPos = -1;
    while (in.incrementToken()) {
      final boolean isFirst = pos == -1;
      int posInc = posIncAtt.getPositionIncrement();
      if (isFirst && posInc == 0) {
        // TODO: hmm are TS's still allowed to do this...?
        System.err.println("WARNING: first posInc was 0; correcting to 1");
        posInc = 1;
      }

      if (posInc > 0) {
        // New node:
        pos += posInc;
        writeNode(pos, Integer.toString(pos));
      }

      if (posInc > 1) {
        // Gap!
        writeArc(lastEndPos, pos, null, "dotted");
      }

      if (isFirst) {
        writeNode(-1, null);
        writeArc(-1, pos, null, null);
      }

      String arcLabel = termAtt.toString();
      if (offsetAtt != null) {
        final int startOffset = offsetAtt.startOffset();
        final int endOffset = offsetAtt.endOffset();
        //System.out.println("start=" + startOffset + " end=" + endOffset + " len=" + inputText.length());
        if (inputText != null) {
          arcLabel += " / " + inputText.substring(startOffset, endOffset);
        } else {
          arcLabel += " / " + startOffset + "-" + endOffset;
        }
      }

      writeArc(pos, pos + posLengthAtt.getPositionLength(), arcLabel, null);
      lastEndPos = pos + posLengthAtt.getPositionLength();
    }

    in.end();

    if (lastEndPos != -1) {
      // TODO: should we output any final text (from end
      // offsets) on this arc...?
      writeNode(-2, null);
      writeArc(lastEndPos, -2, null, null);
    }

    writeTrailer();
  }

  protected void writeArc(int fromNode, int toNode, String label, String style) {
    out.print("  " + fromNode + " -> " + toNode + " [");
    if (label != null) {
      out.print(" label=\"" + label + "\"");
    }
    if (style != null) {
      out.print(" style=\"" + style + "\"");
    }
    out.println("]");
  }

  protected void writeNode(int name, String label) {
    out.print("  " + name);
    if (label != null) {
      out.print(" [label=\"" + label + "\"]");
    } else {
      out.print(" [shape=point color=white]");
    }
    out.println();
  }

  private final static String FONT_NAME = "Helvetica";

  /** Override to customize. */
  protected void writeHeader() {
    out.println("digraph tokens {");
    out.println("  graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];");
    out.println("  // A2 paper size");
    out.println("  size = \"34.4,16.5\";");
    //out.println("  // try to fill paper");
    //out.println("  ratio = fill;");
    out.println("  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]");
    out.println("  node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]");
    out.println();
  }

  /** Override to customize. */
  protected void writeTrailer() {
    out.println("}");
  }
}
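A usage sketch for this helper (analyzer and text assumed); the resulting string is graphviz source:

StringWriter sw = new StringWriter();
TokenStream ts = analyzer.tokenStream("field", new StringReader(text));
ts.reset();
new TokenStreamToDot(text, ts, new PrintWriter(sw)).toDot();
System.out.println(sw);  // render with: dot -Tpng -o tokens.png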
@@ -112,6 +112,8 @@ public final class SynonymFilter extends TokenFilter {

  private int captureCount;

+  // TODO: we should set PositionLengthAttr too...
+
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
@@ -0,0 +1,180 @@
package org.apache.lucene.analysis.kuromoji;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Position;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Type;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.WrappedPositionArray;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;


// TODO: would be nice to show 2nd best path in a diff't
// color...

public class GraphvizFormatter {

  private final static String BOS_LABEL = "BOS";

  private final static String EOS_LABEL = "EOS";

  private final static String FONT_NAME = "Helvetica";

  private final ConnectionCosts costs;

  private final Map<String, String> bestPathMap;

  private final StringBuilder sb = new StringBuilder();

  public GraphvizFormatter(ConnectionCosts costs) {
    this.costs = costs;
    this.bestPathMap = new HashMap<String, String>();
    sb.append(formatHeader());
    sb.append("  init [style=invis]\n");
    sb.append("  init -> 0.0 [label=\"" + BOS_LABEL + "\"]\n");
  }

  public String finish() {
    sb.append(formatTrailer());
    return sb.toString();
  }

  // Backtraces another incremental fragment:
  void onBacktrace(KuromojiTokenizer tok, WrappedPositionArray positions, int lastBackTracePos, Position endPosData, int fromIDX, char[] fragment, boolean isEnd) {
    setBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
    sb.append(formatNodes(tok, positions, lastBackTracePos, endPosData, fragment));
    if (isEnd) {
      sb.append("  fini [style=invis]\n");
      sb.append("  ");
      sb.append(getNodeID(endPosData.pos, fromIDX));
      sb.append(" -> fini [label=\"" + EOS_LABEL + "\"]");
    }
  }

  // Records which arcs make up the best path:
  private void setBestPathMap(WrappedPositionArray positions, int startPos, Position endPosData, int fromIDX) {
    bestPathMap.clear();

    int pos = endPosData.pos;
    int bestIDX = fromIDX;
    while (pos > startPos) {
      final Position posData = positions.get(pos);

      final int backPos = posData.backPos[bestIDX];
      final int backIDX = posData.backIndex[bestIDX];

      final String toNodeID = getNodeID(pos, bestIDX);
      final String fromNodeID = getNodeID(backPos, backIDX);

      assert !bestPathMap.containsKey(fromNodeID);
      assert !bestPathMap.containsValue(toNodeID);
      bestPathMap.put(fromNodeID, toNodeID);
      pos = backPos;
      bestIDX = backIDX;
    }
  }

  private String formatNodes(KuromojiTokenizer tok, WrappedPositionArray positions, int startPos, Position endPosData, char[] fragment) {

    StringBuilder sb = new StringBuilder();
    // Output nodes
    for (int pos = startPos+1; pos <= endPosData.pos; pos++) {
      final Position posData = positions.get(pos);
      for(int idx=0;idx<posData.count;idx++) {
        sb.append("  ");
        sb.append(getNodeID(pos, idx));
        sb.append(" [label=\"");
        sb.append(pos);
        sb.append(": ");
        sb.append(posData.lastRightID[idx]);
        sb.append("\"]\n");
      }
    }

    // Output arcs
    for (int pos = endPosData.pos; pos > startPos; pos--) {
      final Position posData = positions.get(pos);
      for(int idx=0;idx<posData.count;idx++) {
        final Position backPosData = positions.get(posData.backPos[idx]);
        final String toNodeID = getNodeID(pos, idx);
        final String fromNodeID = getNodeID(posData.backPos[idx], posData.backIndex[idx]);

        sb.append("  ");
        sb.append(fromNodeID);
        sb.append(" -> ");
        sb.append(toNodeID);

        final String attrs;
        if (toNodeID.equals(bestPathMap.get(fromNodeID))) {
          // This arc is on best path
          attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
        } else {
          attrs = "";
        }

        final Dictionary dict = tok.getDict(posData.backType[idx]);
        final int wordCost = dict.getWordCost(posData.backID[idx]);
        final int bgCost = costs.get(backPosData.lastRightID[posData.backIndex[idx]],
                                     dict.getLeftId(posData.backID[idx]));

        final String surfaceForm = new String(fragment,
                                              posData.backPos[idx] - startPos,
                                              pos - posData.backPos[idx]);

        sb.append(" [label=\"");
        sb.append(surfaceForm);
        sb.append(' ');
        sb.append(wordCost);
        if (bgCost >= 0) {
          sb.append('+');
        }
        sb.append(bgCost);
        sb.append("\"");
        sb.append(attrs);
        sb.append("]\n");
      }
    }
    return sb.toString();
  }

  private String formatHeader() {
    StringBuilder sb = new StringBuilder();
    sb.append("digraph viterbi {\n");
    sb.append("  graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
    //sb.append("  // A2 paper size\n");
    //sb.append("  size = \"34.4,16.5\";\n");
    //sb.append("  // try to fill paper\n");
    //sb.append("  ratio = fill;\n");
    sb.append("  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
    sb.append("  node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");

    return sb.toString();
  }

  private String formatTrailer() {
    return "}";
  }

  private String getNodeID(int pos, int idx) {
    return pos + "." + idx;
  }
}
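A hedged hookup sketch: the KuromojiTokenizer side (its diff is suppressed below as too large) is assumed to expose a setGraphvizFormatter-style hook that drives onBacktrace(); that method name is an assumption, not shown in this diff:

KuromojiTokenizer tok = new KuromojiTokenizer(new StringReader("関西国際空港"),
    null, true, KuromojiTokenizer.Mode.SEARCH);
GraphvizFormatter gv = new GraphvizFormatter(ConnectionCosts.getInstance());
tok.setGraphvizFormatter(gv);  // assumed hook
tok.reset();
while (tok.incrementToken()) {}
tok.end();
System.out.println(gv.finish());  // graphviz source for the Viterbi lattice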
@@ -27,21 +27,25 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

public class KuromojiAnalyzer extends StopwordAnalyzerBase {
-  private final Segmenter segmenter;
+  private final Mode mode;
  private final Set<String> stoptags;
+  private final UserDictionary userDict;

  public KuromojiAnalyzer(Version matchVersion) {
-    this(matchVersion, new Segmenter(), DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
+    this(matchVersion, null, KuromojiTokenizer.DEFAULT_MODE, DefaultSetHolder.DEFAULT_STOP_SET, DefaultSetHolder.DEFAULT_STOP_TAGS);
  }

-  public KuromojiAnalyzer(Version matchVersion, Segmenter segmenter, CharArraySet stopwords, Set<String> stoptags) {
+  public KuromojiAnalyzer(Version matchVersion, UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
    super(matchVersion, stopwords);
-    this.segmenter = segmenter;
+    this.userDict = userDict;
+    this.mode = mode;
    this.stoptags = stoptags;
  }

@@ -79,7 +83,7 @@ public class KuromojiAnalyzer extends StopwordAnalyzerBase {

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    Tokenizer tokenizer = new KuromojiTokenizer(this.segmenter, reader);
+    Tokenizer tokenizer = new KuromojiTokenizer(reader, userDict, true, mode);
    TokenStream stream = new KuromojiBaseFormFilter(tokenizer);
    stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
    stream = new CJKWidthFilter(stream);
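For callers of the removed Segmenter-based constructor, a migration sketch (stop sets assumed):

// Before: new KuromojiAnalyzer(matchVersion, new Segmenter(), stopwords, stoptags);
// After:
Analyzer a = new KuromojiAnalyzer(matchVersion,
    null,                           // no user dictionary
    KuromojiTokenizer.Mode.SEARCH,  // or KuromojiTokenizer.DEFAULT_MODE
    stopwords, stoptags);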
File diff suppressed because it is too large
@ -1,214 +0,0 @@
|
|||
package org.apache.lucene.analysis.kuromoji;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
|
||||
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
|
||||
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
|
||||
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
|
||||
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
|
||||
import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
|
||||
import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
|
||||
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
|
||||
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
|
||||
|
||||
/**
|
||||
* Tokenizer main class.
|
||||
* Thread safe.
|
||||
*/
|
||||
public class Segmenter {
|
||||
public static enum Mode {
|
||||
NORMAL, SEARCH, EXTENDED
|
||||
}
|
||||
|
||||
public static final Mode DEFAULT_MODE = Mode.SEARCH;
|
||||
|
||||
private final Viterbi viterbi;
|
||||
|
||||
private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
|
||||
|
||||
private final boolean split;
|
||||
|
||||
public Segmenter() {
|
||||
this(null, DEFAULT_MODE, false);
|
||||
}
|
||||
|
||||
public Segmenter(Mode mode) {
|
||||
this(null, mode, false);
|
||||
}
|
||||
|
||||
public Segmenter(UserDictionary userDictionary) {
|
||||
this(userDictionary, DEFAULT_MODE, false);
|
||||
}
|
||||
|
||||
public Segmenter(UserDictionary userDictionary, Mode mode) {
|
||||
this(userDictionary, mode, false);
|
||||
}
|
||||
|
||||
public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) {
|
||||
final TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
|
||||
final UnknownDictionary unknownDict = UnknownDictionary.getInstance();
|
||||
this.viterbi = new Viterbi(dict, unknownDict, ConnectionCosts.getInstance(), userDictionary, mode);
|
||||
this.split = split;
|
||||
|
||||
dictionaryMap.put(Type.KNOWN, dict);
|
||||
dictionaryMap.put(Type.UNKNOWN, unknownDict);
|
||||
dictionaryMap.put(Type.USER, userDictionary);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tokenize input text
|
||||
* @param text
|
||||
* @return list of Token
|
||||
*/
|
||||
public List<Token> tokenize(String text) {
|
||||
|
||||
if (!split) {
|
||||
return doTokenize(0, text);
|
||||
}
|
||||
|
||||
List<Integer> splitPositions = getSplitPositions(text);
|
||||
|
||||
if(splitPositions.size() == 0) {
|
||||
return doTokenize(0, text);
|
||||
}
|
||||
|
||||
ArrayList<Token> result = new ArrayList<Token>();
|
||||
int offset = 0;
|
||||
for(int position : splitPositions) {
|
||||
result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
|
||||
offset = position + 1;
|
||||
}
|
||||
|
||||
if(offset < text.length()) {
|
||||
result.addAll(doTokenize(offset, text.substring(offset)));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Split input text at 句読点, which is 。 and 、
|
||||
* @param text
|
||||
* @return list of split position
|
||||
*/
|
||||
private List<Integer> getSplitPositions(String text) {
|
||||
ArrayList<Integer> splitPositions = new ArrayList<Integer>();
|
||||
|
||||
int position = 0;
|
||||
int currentPosition = 0;
|
||||
|
||||
while(true) {
|
||||
int indexOfMaru = text.indexOf("。", currentPosition);
|
||||
int indexOfTen = text.indexOf("、", currentPosition);
|
||||
|
||||
if(indexOfMaru < 0 || indexOfTen < 0) {
|
||||
position = Math.max(indexOfMaru, indexOfTen);;
|
||||
} else {
|
||||
position = Math.min(indexOfMaru, indexOfTen);
|
||||
}
|
||||
|
||||
if(position >= 0) {
|
||||
splitPositions.add(position);
|
||||
currentPosition = position + 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return splitPositions;
|
||||
}

  private List<Token> doTokenize(int offset, String sentence) {
    char text[] = sentence.toCharArray();
    return doTokenize(offset, text, 0, text.length, false);
  }

  /**
   * Tokenize input sentence.
   * @param offset offset of sentence in original input text
   * @param sentence sentence to tokenize
   * @param sentenceOffset offset into the sentence buffer
   * @param sentenceLength number of characters to tokenize
   * @param discardPunctuation true if punctuation tokens should not be emitted
   * @return list of Token
   */
  public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
    ArrayList<Token> result = new ArrayList<Token>();

    ViterbiNode[][][] lattice;
    try {
      lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
    } catch (IOException impossible) {
      throw new RuntimeException(impossible);
    }
    List<ViterbiNode> bestPath = viterbi.search(lattice);
    for (ViterbiNode node : bestPath) {
      int wordId = node.getWordId();
      if (node.getType() == Type.KNOWN && wordId == -1) { // Do not include BOS/EOS
        continue;
      } else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
        continue; // Do not emit punctuation
      }
      Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
      result.add(token);
    }

    return result;
  }
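
  // Note: the lattice returned by Viterbi.build() is a pair of arrays;
  // lattice[0] holds nodes indexed by start position and lattice[1] holds
  // nodes indexed by end position, which is why both are handed to the
  // Graphviz formatter below.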

  /** returns a Graphviz String */
  public String debugTokenize(String text) {
    ViterbiNode[][][] lattice;
    try {
      lattice = this.viterbi.build(text.toCharArray(), 0, text.length());
    } catch (IOException impossible) {
      throw new RuntimeException(impossible);
    }
    List<ViterbiNode> bestPath = this.viterbi.search(lattice);

    return new GraphvizFormatter(ConnectionCosts.getInstance())
        .format(lattice[0], lattice[1], bestPath);
  }

  static final boolean isPunctuation(char ch) {
    switch (Character.getType(ch)) {
      case Character.SPACE_SEPARATOR:
      case Character.LINE_SEPARATOR:
      case Character.PARAGRAPH_SEPARATOR:
      case Character.CONTROL:
      case Character.FORMAT:
      case Character.DASH_PUNCTUATION:
      case Character.START_PUNCTUATION:
      case Character.END_PUNCTUATION:
      case Character.CONNECTOR_PUNCTUATION:
      case Character.OTHER_PUNCTUATION:
      case Character.MATH_SYMBOL:
      case Character.CURRENCY_SYMBOL:
      case Character.MODIFIER_SYMBOL:
      case Character.OTHER_SYMBOL:
      case Character.INITIAL_QUOTE_PUNCTUATION:
      case Character.FINAL_QUOTE_PUNCTUATION:
        return true;
      default:
        return false;
    }
  }
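
  // For instance, '。' (U+3002) has category OTHER_PUNCTUATION and '¥' has
  // category CURRENCY_SYMBOL, so both are treated as punctuation here.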
}

@ -17,8 +17,8 @@ package org.apache.lucene.analysis.kuromoji;
 * limitations under the License.
 */

import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Type;
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;

public class Token {
  private final Dictionary dictionary;

@ -30,6 +30,7 @@ public class Token {
  private final int length;

  private final int position;
  private int positionLength;

  private final Type type;

@ -40,8 +41,14 @@ public class Token {
    this.length = length;
    this.type = type;
    this.position = position;
    this.positionLength = positionLength;
    this.dictionary = dictionary;
  }

  @Override
  public String toString() {
    return "Token(\"" + new String(surfaceForm, offset, length) + "\" pos=" + position + " type=" + type + " wordId=" + wordId + " leftID=" + dictionary.getLeftId(wordId) + ")";
  }

  /**
   * @return surfaceForm
@ -144,4 +151,21 @@ public class Token {
  public int getPosition() {
    return position;
  }

  /**
   * Set the position length (in tokens) of this token. For normal
   * tokens this is 1; for compound tokens it's > 1.
   */
  public void setPositionLength(int positionLength) {
    this.positionLength = positionLength;
  }

  /**
   * Get the position length (in tokens) of this token. For normal
   * tokens this is 1; for compound tokens it's > 1.
   * @return position length of token
   */
  public int getPositionLength() {
    return positionLength;
  }
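
  // For example, in Mode.SEARCH the compound 関西国際空港 is emitted with
  // positionLength 3 over its parts 関西/国際/空港 (see LUCENE-3767).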
}

@ -27,6 +27,7 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
import org.apache.lucene.analysis.kuromoji.util.CSVUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;

@ -159,6 +160,10 @@ public final class UserDictionary implements Dictionary {
    return found ? toIndexArray(result) : EMPTY_RESULT;
  }

  public TokenInfoFST getFST() {
    return fst;
  }

  private static final int[][] EMPTY_RESULT = new int[0][];

  /**
@ -181,6 +186,10 @@ public final class UserDictionary implements Dictionary {
    }
    return result.toArray(new int[result.size()][]);
  }

  public int[] lookupSegmentation(int phraseID) {
    return segmentations[phraseID];
  }

  @Override
  public int getLeftId(int wordId) {

@ -1,226 +0,0 @@
package org.apache.lucene.analysis.kuromoji.viterbi;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;

public class GraphvizFormatter {

  private final static String BOS_LABEL = "BOS";

  private final static String EOS_LABEL = "EOS";

  private final static String FONT_NAME = "Helvetica";

  private ConnectionCosts costs;

  private Map<String, ViterbiNode> nodeMap;

  private Map<String, String> bestPathMap;

  private boolean foundBOS;

  public GraphvizFormatter(ConnectionCosts costs) {
    this.costs = costs;
    this.nodeMap = new HashMap<String, ViterbiNode>();
    this.bestPathMap = new HashMap<String, String>();
  }

  public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
    initBestPathMap(null);

    StringBuilder sb = new StringBuilder();
    sb.append(formatHeader());
    sb.append(formatNodes(startsArray, endsArray));
    sb.append(formatTrailer());
    return sb.toString();
  }

  public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List<ViterbiNode> bestPath) {

    // List<ViterbiNode> bestPathWithBOSAndEOS = new ArrayList<ViterbiNode>(bestPath);
    initBestPathMap(bestPath);

    StringBuilder sb = new StringBuilder();
    sb.append(formatHeader());
    sb.append(formatNodes(startsArray, endsArray));
    sb.append(formatTrailer());
    return sb.toString();

  }

  private void initBestPathMap(List<ViterbiNode> bestPath) {
    this.bestPathMap.clear();

    if (bestPath == null) {
      return;
    }
    for (int i = 0; i < bestPath.size() - 1; i++) {
      ViterbiNode from = bestPath.get(i);
      ViterbiNode to = bestPath.get(i + 1);

      String fromId = getNodeId(from);
      String toId = getNodeId(to);

      assert this.bestPathMap.containsKey(fromId) == false;
      assert this.bestPathMap.containsValue(toId) == false;
      this.bestPathMap.put(fromId, toId);
    }
  }

  private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
    this.nodeMap.clear();
    this.foundBOS = false;

    StringBuilder sb = new StringBuilder();
    for (int i = 1; i < endsArray.length; i++) {
      if (endsArray[i] == null || startsArray[i] == null) {
        continue;
      }
      for (int j = 0; j < endsArray[i].length; j++) {
        ViterbiNode from = endsArray[i][j];
        if (from == null) {
          continue;
        }
        sb.append(formatNodeIfNew(from));
        for (int k = 0; k < startsArray[i].length; k++) {
          ViterbiNode to = startsArray[i][k];
          if (to == null) {
            break;
          }
          sb.append(formatNodeIfNew(to));
          sb.append(formatEdge(from, to));
        }
      }
    }
    return sb.toString();
  }

  private String formatNodeIfNew(ViterbiNode node) {
    String nodeId = getNodeId(node);
    if (!this.nodeMap.containsKey(nodeId)) {
      this.nodeMap.put(nodeId, node);
      return formatNode(node);
    } else {
      return "";
    }
  }

  private String formatHeader() {
    StringBuilder sb = new StringBuilder();
    sb.append("digraph viterbi {\n");
    sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n");
    sb.append("# A2 paper size\n");
    sb.append("size = \"34.4,16.5\";\n");
    sb.append("# try to fill paper\n");
    sb.append("ratio = fill;\n");
    sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
    sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");

    return sb.toString();
  }

  private String formatTrailer() {
    return "}";
  }

  private String formatEdge(ViterbiNode from, ViterbiNode to) {
    if (this.bestPathMap.containsKey(getNodeId(from)) &&
        this.bestPathMap.get(getNodeId(from)).equals(getNodeId(to))) {
      return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 ");

    } else {
      return formatEdge(from, to, "");
    }
  }

  private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) {
    StringBuilder sb = new StringBuilder();
    sb.append(getNodeId(from));
    sb.append(" -> ");
    sb.append(getNodeId(to));
    sb.append(" [ ");
    sb.append("label=\"");
    sb.append(getCost(from, to));
    sb.append("\"");
    sb.append(" ");
    sb.append(attributes);
    sb.append(" ");
    sb.append(" ]");
    sb.append("\n");
    return sb.toString();
  }

  private String formatNode(ViterbiNode node) {
    StringBuilder sb = new StringBuilder();
    sb.append("\"");
    sb.append(getNodeId(node));
    sb.append("\"");
    sb.append(" [ ");
    sb.append("label=");
    sb.append(formatNodeLabel(node));
    sb.append(" ]");
    return sb.toString();
  }

  private String formatNodeLabel(ViterbiNode node) {
    StringBuilder sb = new StringBuilder();
    sb.append("<<table border=\"0\" cellborder=\"0\">");
    sb.append("<tr><td>");
    sb.append(getNodeLabel(node));
    sb.append("</td></tr>");
    sb.append("<tr><td>");
    sb.append("<font color=\"blue\">");
    sb.append(node.getWordCost());
    sb.append("</font>");
    sb.append("</td></tr>");
    // sb.append("<tr><td>");
    // sb.append(this.dictionary.get(node.getWordId()).getPosInfo());
    // sb.append("</td></tr>");
    sb.append("</table>>");
    return sb.toString();
  }

  private String getNodeId(ViterbiNode node) {
    return String.valueOf(node.hashCode());
  }

  private String getNodeLabel(ViterbiNode node) {
    if (node.getType() == Type.KNOWN && node.getWordId() == 0) {
      if (this.foundBOS) {
        return EOS_LABEL;
      } else {
        this.foundBOS = true;
        return BOS_LABEL;
      }
    } else {
      return node.getSurfaceFormString();
    }
  }

  private int getCost(ViterbiNode from, ViterbiNode to) {
    return this.costs.get(from.getLeftId(), to.getRightId());
  }
}

@ -1,365 +0,0 @@
package org.apache.lucene.analysis.kuromoji.viterbi;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST;
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;

public class Viterbi {

  private final TokenInfoFST fst;

  private final TokenInfoDictionary dictionary;

  private final UnknownDictionary unkDictionary;

  private final ConnectionCosts costs;

  private final UserDictionary userDictionary;

  private final CharacterDefinition characterDefinition;

  private final boolean useUserDictionary;

  private final boolean searchMode;

  private final boolean extendedMode;

  private static final int DEFAULT_COST = 10000000;

  private static final int SEARCH_MODE_KANJI_LENGTH = 2;

  private static final int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH

  private static final int SEARCH_MODE_KANJI_PENALTY = 3000;

  private static final int SEARCH_MODE_OTHER_PENALTY = 1700;

  private static final char[] BOS = "BOS".toCharArray();

  private static final char[] EOS = "EOS".toCharArray();

  /**
   * Constructor
   */
  public Viterbi(TokenInfoDictionary dictionary,
      UnknownDictionary unkDictionary,
      ConnectionCosts costs,
      UserDictionary userDictionary,
      Mode mode) {
    this.dictionary = dictionary;
    this.fst = dictionary.getFST();
    this.unkDictionary = unkDictionary;
    this.costs = costs;
    this.userDictionary = userDictionary;
    if (userDictionary == null) {
      this.useUserDictionary = false;
    } else {
      this.useUserDictionary = true;
    }

    switch (mode) {
      case SEARCH:
        searchMode = true;
        extendedMode = false;
        break;
      case EXTENDED:
        searchMode = true;
        extendedMode = true;
        break;
      default:
        searchMode = false;
        extendedMode = false;
        break;
    }

    this.characterDefinition = unkDictionary.getCharacterDefinition();
  }

  /**
   * Find best path from input lattice.
   * @param lattice the result of the build method
   * @return list of ViterbiNode that make up the best path
   */
  public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
    ViterbiNode[][] startIndexArr = lattice[0];
    ViterbiNode[][] endIndexArr = lattice[1];

    for (int i = 1; i < startIndexArr.length; i++) {

      if (startIndexArr[i] == null || endIndexArr[i] == null) { // continue since no array which contains ViterbiNodes exists. Or no previous node exists.
        continue;
      }

      for (ViterbiNode node : startIndexArr[i]) {
        if (node == null) { // If array doesn't contain ViterbiNode any more, continue to next index
          break;
        }

        int backwardConnectionId = node.getLeftId();
        int wordCost = node.getWordCost();
        int leastPathCost = DEFAULT_COST;
        for (ViterbiNode leftNode : endIndexArr[i]) {
          if (leftNode == null) { // If array doesn't contain ViterbiNode any more, continue to next index
            break;
          }

          int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost; // cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]

          // "Search mode": add an extra penalty cost for long nodes.
          if (searchMode) {
            // System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
            char[] surfaceForm = node.getSurfaceForm();
            int offset = node.getOffset();
            int length = node.getLength();
            if (length > SEARCH_MODE_KANJI_LENGTH) {
              boolean allKanji = true;
              // check if node consists of only kanji
              for (int pos = 0; pos < length; pos++) {
                if (!characterDefinition.isKanji(surfaceForm[offset+pos])) {
                  allKanji = false;
                  break;
                }
              }

              if (allKanji) { // Process only Kanji keywords
                pathCost += (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
              } else if (length > SEARCH_MODE_OTHER_LENGTH) {
                pathCost += (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
              }
            }
          }
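          // Worked example: an all-kanji candidate spanning 6 characters picks up
          // (6 - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY
          // = 4 * 3000 = 12000, nudging the search toward shorter segments.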

          if (pathCost < leastPathCost) { // If total cost is lower than before, set current previous node as best left node (previous means left).
            leastPathCost = pathCost;
            node.setPathCost(leastPathCost);
            node.setLeftNode(leftNode);
          }
        }
      }
    }

    // track best path
    ViterbiNode node = endIndexArr[0][0]; // EOS
    LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
    result.add(node);
    while (true) {
      ViterbiNode leftNode = node.getLeftNode();
      if (leftNode == null) {
        break;
      }

      // EXTENDED mode converts unknown words into unigram nodes
      if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
        byte unigramWordId = CharacterDefinition.NGRAM;
        int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
        int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
        int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
        char[] surfaceForm = leftNode.getSurfaceForm();
        int offset = leftNode.getOffset();
        int length = leftNode.getLength();
        for (int i = length - 1; i >= 0; i--) {
          int charLen = 1;
          if (i > 0 && Character.isLowSurrogate(surfaceForm[offset+i])) {
            i--;
            charLen = 2;
          }
          ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm, offset + i, charLen, unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i, Type.UNKNOWN);
          result.addFirst(uniGramNode);
        }
      } else {
        result.addFirst(leftNode);
      }
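      // e.g. an unknown three-character word becomes three one-character
      // unigram nodes here; surrogate pairs are kept together (charLen == 2).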
|
||||
node = leftNode;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build lattice from input text
|
||||
* @param text
|
||||
*/
|
||||
public ViterbiNode[][][] build(char text[], int offset, int length) throws IOException {
|
||||
ViterbiNode[][] startIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS
|
||||
ViterbiNode[][] endIndexArr = new ViterbiNode[length + 2][]; // text length + BOS and EOS
|
||||
int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr
|
||||
int[] endSizeArr = new int[length + 2]; // array to keep ViterbiNode count in endIndexArr
|
||||
FST.Arc<Long> arc = new FST.Arc<Long>();
|
||||
ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
|
||||
addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
|
||||
|
||||
final FST.BytesReader fstReader = fst.getBytesReader(0);
|
||||
|
||||
// Process user dictionary;
|
||||
if (useUserDictionary) {
|
||||
processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
|
||||
}
|
||||
|
||||
int unknownWordEndIndex = -1; // index of the last character of unknown word
|
||||
|
||||
final IntsRef wordIdRef = new IntsRef();
|
||||
|
||||
for (int startIndex = 0; startIndex < length; startIndex++) {
|
||||
// If no token ends where current token starts, skip this index
|
||||
if (endSizeArr[startIndex + 1] == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int suffixStart = offset + startIndex;
|
||||
int suffixLength = length - startIndex;
|
||||
|
||||
boolean found = false;
|
||||
arc = fst.getFirstArc(arc);
|
||||
int output = 0;
|
||||
for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
|
||||
int ch = text[suffixStart + endIndex - 1];
|
||||
|
||||
if (fst.findTargetArc(ch, arc, arc, endIndex == 1, fstReader) == null) {
|
||||
break; // continue to next position
|
||||
}
|
||||
output += arc.output.intValue();
|
||||
|
||||
if (arc.isFinal()) {
|
||||
final int finalOutput = output + arc.nextFinalOutput.intValue();
|
||||
found = true; // Don't produce unknown word starting from this index
|
||||
dictionary.lookupWordIds(finalOutput, wordIdRef);
|
||||
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
|
||||
final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
|
||||
ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, endIndex, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
|
||||
addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// In the case of normal mode, it doesn't process unknown word greedily.
|
||||
if(!searchMode && unknownWordEndIndex > startIndex){
|
||||
continue;
|
||||
}
|
||||
|
||||
// Process Unknown Word: hmm what is this isInvoke logic (same no matter what)
|
||||
int unknownWordLength = 0;
|
||||
char firstCharacter = text[suffixStart];
|
||||
boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
|
||||
if (isInvoke){ // Process "invoke"
|
||||
unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
|
||||
} else if (found == false){ // Process not "invoke"
|
||||
unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
|
||||
}
|
||||
|
||||
if (unknownWordLength > 0) { // found unknown word
|
||||
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
|
||||
unkDictionary.lookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
|
||||
for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
|
||||
final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
|
||||
ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, unknownWordLength, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
|
||||
addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
|
||||
}
|
||||
unknownWordEndIndex = startIndex + unknownWordLength;
|
||||
}
|
||||
}
|
||||
|
||||
ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
|
||||
addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
|
||||
|
||||
ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
|
||||
|
||||
return result;
|
||||
}
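
  // Sketch of the lattice layout built above: endIndexArr[0] holds EOS,
  // BOS ends at index 1, and a node covering text positions [s, s+len) is
  // registered at startIndexArr[s + 1] and endIndexArr[s + 1 + len].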

  /**
   * Find token(s) in input text and set found token(s) in arrays as normal tokens
   * @param text input text buffer
   * @param startIndexArr nodes indexed by start position
   * @param endIndexArr nodes indexed by end position
   * @param startSizeArr node counts for startIndexArr
   * @param endSizeArr node counts for endIndexArr
   */
  private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) throws IOException {
    int[][] result = userDictionary.lookup(text, offset, len);
    for (int[] segmentation : result) {
      int wordId = segmentation[0];
      int index = segmentation[1];
      int length = segmentation[2];
      ViterbiNode node = new ViterbiNode(wordId, text, offset + index, length, userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
      addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
    }
  }

  /**
   * Add node to arrays and increment count in size array
   * @param node node to add
   * @param startIndex start position index
   * @param endIndex end position index
   * @param startIndexArr nodes indexed by start position
   * @param endIndexArr nodes indexed by end position
   * @param startSizeArr node counts for startIndexArr
   * @param endSizeArr node counts for endIndexArr
   */
  private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) {
    int startNodesCount = startSizeArr[startIndex];
    int endNodesCount = endSizeArr[endIndex];

    if (startNodesCount == 0) {
      startIndexArr[startIndex] = new ViterbiNode[10];
    }

    if (endNodesCount == 0) {
      endIndexArr[endIndex] = new ViterbiNode[10];
    }

    if (startIndexArr[startIndex].length <= startNodesCount) {
      startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
    }

    if (endIndexArr[endIndex].length <= endNodesCount) {
      endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
    }

    startIndexArr[startIndex][startNodesCount] = node;
    endIndexArr[endIndex][endNodesCount] = node;

    startSizeArr[startIndex] = startNodesCount + 1;
    endSizeArr[endIndex] = endNodesCount + 1;
  }

  /**
   * Return an array twice as big that contains the values of the input array
   * @param array array to extend
   * @return the extended array
   */
  private ViterbiNode[] extendArray(ViterbiNode[] array) {
    // double the capacity, copying the existing nodes over
    ViterbiNode[] newArray = new ViterbiNode[array.length * 2];
    System.arraycopy(array, 0, newArray, 0, array.length);
    return newArray;
  }
}

@ -1,147 +0,0 @@
package org.apache.lucene.analysis.kuromoji.viterbi;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public final class ViterbiNode {
  public enum Type {
    KNOWN,
    UNKNOWN,
    USER
  }

  private final int wordId;

  private final char[] surfaceForm;
  private final int offset;
  private final int length;

  private final int leftId;

  private final int rightId;

  /** word cost for this node */
  private final int wordCost;

  /** minimum path cost found thus far */
  private int pathCost;

  private ViterbiNode leftNode;

  private final Type type;

  private final int startIndex;

  public ViterbiNode(int wordId, char[] surfaceForm, int offset, int length, int leftId, int rightId, int wordCost, int startIndex, Type type) {
    this.wordId = wordId;
    this.surfaceForm = surfaceForm;
    this.offset = offset;
    this.length = length;
    this.leftId = leftId;
    this.rightId = rightId;
    this.wordCost = wordCost;
    this.startIndex = startIndex;
    this.type = type;
  }

  /**
   * @return the wordId
   */
  public int getWordId() {
    return wordId;
  }

  /**
   * @return the surfaceForm
   */
  public char[] getSurfaceForm() {
    return surfaceForm;
  }

  /**
   * @return start offset into surfaceForm
   */
  public int getOffset() {
    return offset;
  }

  /**
   * @return length of surfaceForm
   */
  public int getLength() {
    return length;
  }

  /**
   * @return the surfaceForm as a String
   */
  public String getSurfaceFormString() {
    return new String(surfaceForm, offset, length);
  }

  /**
   * @return the leftId
   */
  public int getLeftId() {
    return leftId;
  }

  /**
   * @return the rightId
   */
  public int getRightId() {
    return rightId;
  }

  /**
   * @return the word cost
   */
  public int getWordCost() {
    return wordCost;
  }

  /**
   * @return the path cost
   */
  public int getPathCost() {
    return pathCost;
  }

  /**
   * @param pathCost minimum path cost found thus far
   */
  public void setPathCost(int pathCost) {
    this.pathCost = pathCost;
  }

  public void setLeftNode(ViterbiNode node) {
    leftNode = node;
  }

  public ViterbiNode getLeftNode() {
    return leftNode;
  }

  public int getStartIndex() {
    return startIndex;
  }

  public Type getType() {
    return type;
  }
}

@ -1,231 +0,0 @@
package org.apache.lucene.analysis.kuromoji;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.List;

import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

public class SegmenterTest extends LuceneTestCase {

  private static Segmenter segmenter;

  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    segmenter = new Segmenter();
  }

  @AfterClass
  public static void afterClass() throws Exception {
    segmenter = null;
  }

  @Test
  public void testSegmentation() {
    // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
    // String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
    // String[] surfaceForms = {
    //   "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
    //   "スペース", "ステーション", "に", "行き", "ます", "。",
    //   "うたがわしい", "。"
    // };
    String input = "スペースステーションに行きます。うたがわしい。";
    String[] surfaceForms = {
      "スペース", "ステーション", "に", "行き", "ます", "。",
      "うたがわしい", "。"
    };
    List<Token> tokens = segmenter.tokenize(input);
    assertTrue(tokens.size() == surfaceForms.length);
    for (int i = 0; i < tokens.size(); i++) {
      assertEquals(surfaceForms[i], tokens.get(i).getSurfaceFormString());
    }
  }

  @Test
  public void testReadings() {
    List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
    assertEquals(6, tokens.size());
    assertEquals("スシ", tokens.get(0).getReading());
    assertEquals("ガ", tokens.get(1).getReading());
    assertEquals("タベ", tokens.get(2).getReading());
    assertEquals("タイ", tokens.get(3).getReading());
    assertEquals("デス", tokens.get(4).getReading());
    assertEquals("。", tokens.get(5).getReading());
  }

  @Test
  public void testReadings2() {
    List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
    assertEquals(9, tokens.size());
    assertEquals("オオク", tokens.get(0).getReading());
    assertEquals("ノ", tokens.get(1).getReading());
    assertEquals("ガクセイ", tokens.get(2).getReading());
    assertEquals("ガ", tokens.get(3).getReading());
    assertEquals("シケン", tokens.get(4).getReading());
    assertEquals("ニ", tokens.get(5).getReading());
    assertEquals("オチ", tokens.get(6).getReading());
    assertEquals("タ", tokens.get(7).getReading());
    assertEquals("。", tokens.get(8).getReading());
  }

  @Test
  public void testPronunciations() {
    List<Token> tokens = segmenter.tokenize("寿司が食べたいです。");
    assertEquals(6, tokens.size());
    assertEquals("スシ", tokens.get(0).getPronunciation());
    assertEquals("ガ", tokens.get(1).getPronunciation());
    assertEquals("タベ", tokens.get(2).getPronunciation());
    assertEquals("タイ", tokens.get(3).getPronunciation());
    assertEquals("デス", tokens.get(4).getPronunciation());
    assertEquals("。", tokens.get(5).getPronunciation());
  }

  @Test
  public void testPronunciations2() {
    List<Token> tokens = segmenter.tokenize("多くの学生が試験に落ちた。");
    assertEquals(9, tokens.size());
    // pronunciation differs from reading here
    assertEquals("オーク", tokens.get(0).getPronunciation());
    assertEquals("ノ", tokens.get(1).getPronunciation());
    assertEquals("ガクセイ", tokens.get(2).getPronunciation());
    assertEquals("ガ", tokens.get(3).getPronunciation());
    assertEquals("シケン", tokens.get(4).getPronunciation());
    assertEquals("ニ", tokens.get(5).getPronunciation());
    assertEquals("オチ", tokens.get(6).getPronunciation());
    assertEquals("タ", tokens.get(7).getPronunciation());
    assertEquals("。", tokens.get(8).getPronunciation());
  }

  @Test
  public void testBasicForms() {
    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
    assertEquals(9, tokens.size());
    assertNull(tokens.get(0).getBaseForm());
    assertNull(tokens.get(1).getBaseForm());
    assertNull(tokens.get(2).getBaseForm());
    assertNull(tokens.get(3).getBaseForm());
    assertNull(tokens.get(4).getBaseForm());
    assertNull(tokens.get(5).getBaseForm());
    assertEquals(tokens.get(6).getBaseForm(), "ある");
    assertNull(tokens.get(7).getBaseForm());
    assertNull(tokens.get(8).getBaseForm());
  }

  @Test
  public void testInflectionTypes() {
    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
    assertEquals(9, tokens.size());
    assertNull(tokens.get(0).getInflectionType());
    assertNull(tokens.get(1).getInflectionType());
    assertNull(tokens.get(2).getInflectionType());
    assertNull(tokens.get(3).getInflectionType());
    assertNull(tokens.get(4).getInflectionType());
    assertNull(tokens.get(5).getInflectionType());
    assertEquals("五段・ラ行", tokens.get(6).getInflectionType());
    assertEquals("特殊・マス", tokens.get(7).getInflectionType());
    assertNull(tokens.get(8).getInflectionType());
  }

  @Test
  public void testInflectionForms() {
    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
    assertEquals(9, tokens.size());
    assertNull(tokens.get(0).getInflectionForm());
    assertNull(tokens.get(1).getInflectionForm());
    assertNull(tokens.get(2).getInflectionForm());
    assertNull(tokens.get(3).getInflectionForm());
    assertNull(tokens.get(4).getInflectionForm());
    assertNull(tokens.get(5).getInflectionForm());
    assertEquals("連用形", tokens.get(6).getInflectionForm());
    assertEquals("基本形", tokens.get(7).getInflectionForm());
    assertNull(tokens.get(8).getInflectionForm());
  }

  @Test
  public void testPartOfSpeech() {
    List<Token> tokens = segmenter.tokenize("それはまだ実験段階にあります。");
    assertEquals(9, tokens.size());
    assertEquals("名詞-代名詞-一般", tokens.get(0).getPartOfSpeech());
    assertEquals("助詞-係助詞", tokens.get(1).getPartOfSpeech());
    assertEquals("副詞-助詞類接続", tokens.get(2).getPartOfSpeech());
    assertEquals("名詞-サ変接続", tokens.get(3).getPartOfSpeech());
    assertEquals("名詞-一般", tokens.get(4).getPartOfSpeech());
    assertEquals("助詞-格助詞-一般", tokens.get(5).getPartOfSpeech());
    assertEquals("動詞-自立", tokens.get(6).getPartOfSpeech());
    assertEquals("助動詞", tokens.get(7).getPartOfSpeech());
    assertEquals("記号-句点", tokens.get(8).getPartOfSpeech());
  }

  // TODO: the next 2 tests are no longer using the first/last word ids, maybe lookup the words and fix?
  // do we have a possibility to actually lookup the first and last word from dictionary?
  public void testYabottai() {
    List<Token> tokens = segmenter.tokenize("やぼったい");
    assertEquals(1, tokens.size());
    assertEquals("やぼったい", tokens.get(0).getSurfaceFormString());
  }

  public void testTsukitosha() {
    List<Token> tokens = segmenter.tokenize("突き通しゃ");
    assertEquals(1, tokens.size());
    assertEquals("突き通しゃ", tokens.get(0).getSurfaceFormString());
  }

  public void testBocchan() throws Exception {
    doTestBocchan(1);
  }

  @Test @Nightly
  public void testBocchanBig() throws Exception {
    doTestBocchan(100);
  }

  private void doTestBocchan(int numIterations) throws Exception {
    LineNumberReader reader = new LineNumberReader(new InputStreamReader(
        this.getClass().getResourceAsStream("bocchan.utf-8")));

    String line = reader.readLine();
    reader.close();

    if (VERBOSE) {
      System.out.println("Test for Bocchan without pre-splitting sentences");
    }
    long totalStart = System.currentTimeMillis();
    for (int i = 0; i < numIterations; i++) {
      segmenter.tokenize(line);
    }
    if (VERBOSE) {
      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
      System.out.println("Test for Bocchan with pre-splitting sentences");
    }
    String[] sentences = line.split("、|。");
    totalStart = System.currentTimeMillis();
    for (int i = 0; i < numIterations; i++) {
      for (String sentence: sentences) {
        segmenter.tokenize(sentence);
      }
    }
    if (VERBOSE) {
      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
    }
  }
}

@ -25,18 +25,17 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;

public class TestExtendedMode extends BaseTokenStreamTestCase {
  private final Segmenter segmenter = new Segmenter(Mode.EXTENDED);
  private final Analyzer analyzer = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.EXTENDED);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };

@ -18,8 +18,11 @@ package org.apache.lucene.analysis.kuromoji;
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;

public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
  /** This test fails with NPE when the
@ -41,20 +44,103 @@ public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
      new int[] { 1, 2, 2, 2 }
    );
  }

  /**
   * Test that search mode is enabled and working by default
   */
  public void testDecomposition() throws IOException {
    assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "シニアソフトウェアエンジニア",
      new String[] { "シニア", "ソフトウェア", "エンジニア" }
    );

    final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, null, Mode.SEARCH,
        KuromojiAnalyzer.getDefaultStopSet(),
        KuromojiAnalyzer.getDefaultStopTags());

    /*
    //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
    TokenStream ts = a.tokenStream("foo", new StringReader("�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
    ts.reset();
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    while(ts.incrementToken()) {
      System.out.println(" " + termAtt.toString());
    }
    System.out.println("DONE PARSE\n\n");
    */

    // Senior software engineer:
    assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア",
      new String[] { "シニア",
                     "シニアソフトウェアエンジニア",
                     "ソフトウェア",
                     "エンジニア" },
      new int[] { 1, 0, 1, 1},
      new int[] { 1, 3, 1, 1}
    );
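    // (posInc 0 keeps the compound stacked at the same position as its first
    // part; posLength 3 marks that it spans all three sub-tokens.)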
|
||||
|
||||
// Kansai International Airport:
|
||||
assertAnalyzesToPositions(a, "関西国際空港",
|
||||
new String[] { "関西",
|
||||
"関西国際空港", // zero pos inc
|
||||
"国際",
|
||||
"空港" },
|
||||
new int[] {1, 0, 1, 1},
|
||||
new int[] {1, 3, 1, 1}
|
||||
);
|
||||
|
||||
// Konika Minolta Holdings; not quite the right
|
||||
// segmentation (see LUCENE-3726):
|
||||
assertAnalyzesToPositions(a, "コニカミノルタホールディングス",
|
||||
new String[] { "コニカ",
|
||||
"コニカミノルタホールディングス", // zero pos inc
|
||||
"ミノルタ",
|
||||
"ホールディングス"},
|
||||
new int[] {1, 0, 1, 1},
|
||||
new int[] {1, 3, 1, 1}
|
||||
);
|
||||
|
||||
// Narita Airport
|
||||
assertAnalyzesToPositions(a, "成田空港",
|
||||
new String[] { "成田",
|
||||
"成田空港",
|
||||
"空港" },
|
||||
new int[] {1, 0, 1},
|
||||
new int[] {1, 2, 1}
|
||||
);
|
||||
|
||||
// Kyoto University Baseball Club
|
||||
assertAnalyzesToPositions(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "京都大学硬式野球部",
|
||||
new String[] { "京都大",
|
||||
"学",
|
||||
"硬式",
|
||||
"野球",
|
||||
"部" },
|
||||
new int[] {1, 1, 1, 1, 1},
|
||||
new int[] {1, 1, 1, 1, 1});
|
||||
// toDotFile(a, "成田空港", "/mnt/scratch/out.dot");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* blast random strings against the analyzer
|
||||
*/
|
||||
public void testRandom() throws IOException {
|
||||
checkRandomData(random, new KuromojiAnalyzer(TEST_VERSION_CURRENT), atLeast(10000));
|
||||
final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, null, Mode.SEARCH,
|
||||
KuromojiAnalyzer.getDefaultStopSet(),
|
||||
KuromojiAnalyzer.getDefaultStopTags());
|
||||
checkRandomData(random, a, atLeast(10000));
|
||||
}
|
||||
|
||||
// Copied from TestKuromojiTokenizer, to make sure passing
|
||||
// user dict to analyzer works:
|
||||
public void testUserDict3() throws Exception {
|
||||
// Test entry that breaks into multiple tokens:
|
||||
final Analyzer a = new KuromojiAnalyzer(TEST_VERSION_CURRENT, TestKuromojiTokenizer.readDict(),
|
||||
Mode.SEARCH,
|
||||
KuromojiAnalyzer.getDefaultStopSet(),
|
||||
KuromojiAnalyzer.getDefaultStopTags());
|
||||
assertTokenStreamContents(a.tokenStream("foo", new StringReader("abcd")),
|
||||
new String[] { "a", "b", "cd" },
|
||||
new int[] { 0, 1, 2 },
|
||||
new int[] { 1, 2, 4 },
|
||||
new Integer(4)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@ public class TestKuromojiBaseFormFilter extends BaseTokenStreamTestCase {
|
|||
private Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new KuromojiTokenizer(reader);
|
||||
Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.DEFAULT_MODE);
|
||||
return new TokenStreamComponents(tokenizer, new KuromojiBaseFormFilter(tokenizer));
|
||||
}
|
||||
};
|
||||
|
|
|
@ -17,7 +17,13 @@ package org.apache.lucene.analysis.kuromoji;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.PrintWriter;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
|
@ -25,21 +31,76 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
|
||||
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
|
||||
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
|
||||
import org.apache.lucene.analysis.kuromoji.tokenattributes.*;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
|
||||
|
||||
public static UserDictionary readDict() {
|
||||
InputStream is = TestKuromojiTokenizer.class.getResourceAsStream("userdict.txt");
|
||||
if (is == null) {
|
||||
throw new RuntimeException("Cannot find userdict.txt in test classpath!");
|
||||
}
|
||||
try {
|
||||
try {
|
||||
Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
|
||||
return new UserDictionary(reader);
|
||||
} finally {
|
||||
is.close();
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
throw new RuntimeException(ioe);
|
||||
}
|
||||
}
|
||||
|
||||
private Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new KuromojiTokenizer(reader);
|
||||
Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
private Analyzer analyzerNormal = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.NORMAL);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
|
||||
private Analyzer analyzerNoPunct = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.SEARCH);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
|
||||
private Analyzer extendedModeAnalyzerNoPunct = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), true, Mode.EXTENDED);
|
||||
return new TokenStreamComponents(tokenizer, tokenizer);
|
||||
}
|
||||
};
|
||||
|
||||
public void testNormalMode() throws Exception {
|
||||
assertAnalyzesTo(analyzerNormal,
|
||||
"シニアソフトウェアエンジニア",
|
||||
new String[] {"シニアソフトウェアエンジニア"});
|
||||
}
|
||||
|
||||
public void testDecomposition1() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
|
||||
assertAnalyzesTo(analyzerNoPunct, "本来は、貧困層の女性や子供に医療保護を提供するために創設された制度である、" +
|
||||
"アメリカ低所得者医療援助制度が、今日では、その予算の約3分の1を老人に費やしている。",
|
||||
new String[] { "本来", "は", "貧困", "層", "の", "女性", "や", "子供", "に", "医療", "保護", "を",
|
||||
"提供", "する", "ため", "に", "創設", "さ", "れ", "た", "制度", "で", "ある", "アメリカ",
|
||||
|
@ -55,7 +116,7 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testDecomposition2() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "麻薬の密売は根こそぎ絶やさなければならない",
|
||||
assertAnalyzesTo(analyzerNoPunct, "麻薬の密売は根こそぎ絶やさなければならない",
|
||||
new String[] { "麻薬", "の", "密売", "は", "根こそぎ", "絶やさ", "なけれ", "ば", "なら", "ない" },
|
||||
new int[] { 0, 2, 3, 5, 6, 10, 13, 16, 17, 19 },
|
||||
new int[] { 2, 3, 5, 6, 10, 13, 16, 17, 19, 21 }
|
||||
|
@ -63,7 +124,7 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testDecomposition3() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。",
|
||||
assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。",
|
||||
new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス" },
|
||||
new int[] { 0, 2, 3, 5, 10 },
|
||||
new int[] { 2, 3, 5, 9, 15 }
|
||||
|
@ -91,9 +152,32 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
|
|||
ts.close();
|
||||
}
|
||||
|
||||
/*
|
||||
// NOTE: intentionally fails! Just trying to debug this
|
||||
// one input...
|
||||
public void testDecomposition6() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "奈良先端科学技術大学院大学",
|
||||
new String[] { "これ", "は", "本", "で", "は", "ない" },
|
||||
new int[] { 0, 2, 3, 4, 5, 6 },
|
||||
new int[] { 2, 3, 4, 5, 6, 8 }
|
||||
);
|
||||
}
|
||||
*/
|
||||
|
||||
/** Tests that sentence offset is incorporated into the resulting offsets */
|
||||
public void testTwoSentences() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
|
||||
/*
|
||||
//TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
|
||||
TokenStream ts = analyzer.tokenStream("foo", new StringReader("�<!--\"<!--#<!--;?><!--#<!--#><!---->?>-->;"));
|
||||
ts.reset();
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
while(ts.incrementToken()) {
|
||||
System.out.println(" " + termAtt.toString());
|
||||
}
|
||||
System.out.println("DONE PARSE\n\n");
|
||||
*/
|
||||
|
||||
assertAnalyzesTo(analyzerNoPunct, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。",
|
||||
new String[] { "魔女", "狩", "大将", "マシュー", "ホプキンス", "魔女", "狩", "大将", "マシュー", "ホプキンス" },
|
||||
new int[] { 0, 2, 3, 5, 10, 17, 19, 20, 22, 27 },
|
||||
new int[] { 2, 3, 5, 9, 15, 19, 20, 22, 26, 32 }
|
||||
|
@ -103,6 +187,7 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
|
|||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
|
||||
checkRandomData(random, analyzerNoPunct, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
public void testLargeDocReliability() throws Exception {
|
||||
|
@ -125,6 +210,9 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
|
|||
public void testSurrogates2() throws IOException {
|
||||
int numIterations = atLeast(10000);
|
||||
for (int i = 0; i < numIterations; i++) {
|
||||
if (VERBOSE) {
|
||||
System.out.println("\nTEST: iter=" + i);
|
||||
}
|
||||
String s = _TestUtil.randomUnicodeString(random, 100);
|
||||
TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
|
@ -134,22 +222,410 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testOnlyPunctuation() throws IOException {
|
||||
TokenStream ts = analyzerNoPunct.tokenStream("foo", new StringReader("。、。。"));
|
||||
ts.reset();
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
|
||||
public void testOnlyPunctuationExtended() throws IOException {
|
||||
TokenStream ts = extendedModeAnalyzerNoPunct.tokenStream("foo", new StringReader("......"));
|
||||
ts.reset();
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.end();
|
||||
}
|
||||
|
||||
// note: test is kinda silly since kuromoji emits punctuation tokens.
|
||||
// but, when/if we filter these out it will be useful.
|
||||
public void testEnd() throws Exception {
|
||||
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない")),
|
||||
assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない")),
|
||||
new String[] { "これ", "は", "本", "で", "は", "ない" },
|
||||
new int[] { 0, 2, 3, 4, 5, 6 },
|
||||
new int[] { 2, 3, 4, 5, 6, 8 },
|
||||
new Integer(8)
|
||||
);
|
||||
|
||||
assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("これは本ではない ")),
|
||||
|
||||
assertTokenStreamContents(analyzerNoPunct.tokenStream("foo", new StringReader("これは本ではない ")),
|
||||
new String[] { "これ", "は", "本", "で", "は", "ない" },
|
||||
new int[] { 0, 2, 3, 4, 5, 6, 8 },
|
||||
new int[] { 2, 3, 4, 5, 6, 8, 9 },
|
||||
new Integer(12)
|
||||
);
|
||||
}

  public void testUserDict() throws Exception {
    // Not a great test because w/o userdict.txt the
    // segmentation is the same:
    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("関西国際空港に行った")),
      new String[] { "関西", "国際", "空港", "に", "行っ", "た" },
      new int[] { 0, 2, 4, 6, 7, 9 },
      new int[] { 2, 4, 6, 7, 9, 10 },
      new Integer(10)
    );
  }

  public void testUserDict2() throws Exception {
    // Better test: w/o userdict the segmentation is different:
    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("朝青龍")),
      new String[] { "朝青龍" },
      new int[] { 0 },
      new int[] { 3 },
      new Integer(3)
    );
  }

  public void testUserDict3() throws Exception {
    // Test entry that breaks into multiple tokens:
    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcd")),
      new String[] { "a", "b", "cd" },
      new int[] { 0, 1, 2 },
      new int[] { 1, 2, 4 },
      new Integer(4)
    );
  }

  // HMM: fails (segments as a/b/cd/efghij)... because the
  // two paths have exactly equal costs (1 KNOWN + 1
  // UNKNOWN) and we don't seem to favor longer KNOWN /
  // shorter UNKNOWN matches:

  /*
  public void testUserDict4() throws Exception {
    // Test entry that has another entry as prefix
    assertTokenStreamContents(analyzer.tokenStream("foo", new StringReader("abcdefghij")),
      new String[] { "ab", "cd", "efg", "hij" },
      new int[] { 0, 2, 4, 7 },
      new int[] { 2, 4, 7, 10 },
      new Integer(10)
    );
  }
  */

  public void testSegmentation() throws Exception {
    // Skip tests for Michelle Kwan -- UniDic segments Kwan as ク ワン
    // String input = "ミシェル・クワンが優勝しました。スペースステーションに行きます。うたがわしい。";
    // String[] surfaceForms = {
    //   "ミシェル", "・", "クワン", "が", "優勝", "し", "まし", "た", "。",
    //   "スペース", "ステーション", "に", "行き", "ます", "。",
    //   "うたがわしい", "。"
    // };
    String input = "スペースステーションに行きます。うたがわしい。";
    String[] surfaceForms = {
      "スペース", "ステーション", "に", "行き", "ます", "。",
      "うたがわしい", "。"
    };
    assertAnalyzesTo(analyzer,
                     input,
                     surfaceForms);
  }

  public void testLatticeToDot() throws Exception {
    final GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.getInstance());
    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        KuromojiTokenizer tokenizer = new KuromojiTokenizer(reader, readDict(), false, Mode.SEARCH);
        tokenizer.setGraphvizFormatter(gv2);
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };

    String input = "スペースステーションに行きます。うたがわしい。";
    String[] surfaceForms = {
      "スペース", "ステーション", "に", "行き", "ます", "。",
      "うたがわしい", "。"
    };
    assertAnalyzesTo(analyzer,
                     input,
                     surfaceForms);

    assertTrue(gv2.finish().indexOf("22.0") != -1);
  }
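gv2.finish() presumably returns the lattice captured during tokenization as Graphviz dot text (the assertion above checks for a cost label in that output), so the graph can also be dumped for visual inspection. A minimal sketch under that assumption; the output path and the dot invocation are illustrative, not part of this commit:

    Writer w = new FileWriter("/tmp/kuromoji-lattice.dot"); // hypothetical path
    try {
      w.write(gv2.finish()); // dot source accumulated while tokenizing
    } finally {
      w.close();
    }
    // render with: dot -Tpng /tmp/kuromoji-lattice.dot -o lattice.png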

  private void assertReadings(String input, String... readings) throws IOException {
    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
    ts.reset();
    for(String reading : readings) {
      assertTrue(ts.incrementToken());
      assertEquals(reading, readingAtt.getReading());
    }
    assertFalse(ts.incrementToken());
    ts.end();
  }

  private void assertPronunciations(String input, String... pronunciations) throws IOException {
    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
    ReadingAttribute readingAtt = ts.addAttribute(ReadingAttribute.class);
    ts.reset();
    for(String pronunciation : pronunciations) {
      assertTrue(ts.incrementToken());
      assertEquals(pronunciation, readingAtt.getPronunciation());
    }
    assertFalse(ts.incrementToken());
    ts.end();
  }

  private void assertBaseForms(String input, String... baseForms) throws IOException {
    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
    BaseFormAttribute baseFormAtt = ts.addAttribute(BaseFormAttribute.class);
    ts.reset();
    for(String baseForm : baseForms) {
      assertTrue(ts.incrementToken());
      assertEquals(baseForm, baseFormAtt.getBaseForm());
    }
    assertFalse(ts.incrementToken());
    ts.end();
  }

  private void assertInflectionTypes(String input, String... inflectionTypes) throws IOException {
    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
    InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
    ts.reset();
    for(String inflectionType : inflectionTypes) {
      assertTrue(ts.incrementToken());
      assertEquals(inflectionType, inflectionAtt.getInflectionType());
    }
    assertFalse(ts.incrementToken());
    ts.end();
  }

  private void assertInflectionForms(String input, String... inflectionForms) throws IOException {
    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
    InflectionAttribute inflectionAtt = ts.addAttribute(InflectionAttribute.class);
    ts.reset();
    for(String inflectionForm : inflectionForms) {
      assertTrue(ts.incrementToken());
      assertEquals(inflectionForm, inflectionAtt.getInflectionForm());
    }
    assertFalse(ts.incrementToken());
    ts.end();
  }

  private void assertPartsOfSpeech(String input, String... partsOfSpeech) throws IOException {
    TokenStream ts = analyzer.tokenStream("ignored", new StringReader(input));
    PartOfSpeechAttribute partOfSpeechAtt = ts.addAttribute(PartOfSpeechAttribute.class);
    ts.reset();
    for(String partOfSpeech : partsOfSpeech) {
      assertTrue(ts.incrementToken());
      assertEquals(partOfSpeech, partOfSpeechAtt.getPartOfSpeech());
    }
    assertFalse(ts.incrementToken());
    ts.end();
  }

  public void testReadings() throws Exception {
    assertReadings("寿司が食べたいです。",
                   "スシ",
                   "ガ",
                   "タベ",
                   "タイ",
                   "デス",
                   "。");
  }

  public void testReadings2() throws Exception {
    assertReadings("多くの学生が試験に落ちた。",
                   "オオク",
                   "ノ",
                   "ガクセイ",
                   "ガ",
                   "シケン",
                   "ニ",
                   "オチ",
                   "タ",
                   "。");
  }

  public void testPronunciations() throws Exception {
    assertPronunciations("寿司が食べたいです。",
                         "スシ",
                         "ガ",
                         "タベ",
                         "タイ",
                         "デス",
                         "。");
  }

  public void testPronunciations2() throws Exception {
    // pronunciation differs from reading here
    assertPronunciations("多くの学生が試験に落ちた。",
                         "オーク",
                         "ノ",
                         "ガクセイ",
                         "ガ",
                         "シケン",
                         "ニ",
                         "オチ",
                         "タ",
                         "。");
  }

  public void testBasicForms() throws Exception {
    assertBaseForms("それはまだ実験段階にあります。",
                    null,
                    null,
                    null,
                    null,
                    null,
                    null,
                    "ある",
                    null,
                    null);
  }

  public void testInflectionTypes() throws Exception {
    assertInflectionTypes("それはまだ実験段階にあります。",
                          null,
                          null,
                          null,
                          null,
                          null,
                          null,
                          "五段・ラ行",
                          "特殊・マス",
                          null);
  }

  public void testInflectionForms() throws Exception {
    assertInflectionForms("それはまだ実験段階にあります。",
                          null,
                          null,
                          null,
                          null,
                          null,
                          null,
                          "連用形",
                          "基本形",
                          null);
  }

  public void testPartOfSpeech() throws Exception {
    assertPartsOfSpeech("それはまだ実験段階にあります。",
                        "名詞-代名詞-一般",
                        "助詞-係助詞",
                        "副詞-助詞類接続",
                        "名詞-サ変接続",
                        "名詞-一般",
                        "助詞-格助詞-一般",
                        "動詞-自立",
                        "助動詞",
                        "記号-句点");
  }
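For readers without Japanese: the expected values are IPADIC's hierarchical part-of-speech tags. Roughly, 名詞-代名詞-一般 is noun / pronoun / general, 動詞-自立 an independent verb, 助動詞 an auxiliary verb, and 記号-句点 the sentence-ending period symbol.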

  // TODO: the next 2 tests are no longer using the first/last word ids; maybe look up the words and fix?
  // Is there a way to actually look up the first and last word from the dictionary?
  public void testYabottai() throws Exception {
    assertAnalyzesTo(analyzer, "やぼったい",
                     new String[] {"やぼったい"});
  }

  public void testTsukitosha() throws Exception {
    assertAnalyzesTo(analyzer, "突き通しゃ",
                     new String[] {"突き通しゃ"});
  }

  public void testBocchan() throws Exception {
    doTestBocchan(1);
  }

  @Nightly
  public void testBocchanBig() throws Exception {
    doTestBocchan(100);
  }

  /*
  public void testWikipedia() throws Exception {
    final FileInputStream fis = new FileInputStream("/q/lucene/jawiki-20120220-pages-articles.xml");
    final Reader r = new BufferedReader(new InputStreamReader(fis, "UTF-8"));

    final long startTimeNS = System.nanoTime();
    boolean done = false;
    long compoundCount = 0;
    long nonCompoundCount = 0;
    long netOffset = 0;
    while (!done) {
      final TokenStream ts = analyzer.tokenStream("ignored", r);
      ts.reset();
      final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      int count = 0;
      while (true) {
        if (!ts.incrementToken()) {
          done = true;
          break;
        }
        count++;
        if (posIncAtt.getPositionIncrement() == 0) {
          compoundCount++;
        } else {
          nonCompoundCount++;
          if (nonCompoundCount % 1000000 == 0) {
            System.out.println(String.format("%.2f msec [pos=%d, %d, %d]",
                (System.nanoTime()-startTimeNS)/1000000.0,
                netOffset + offsetAtt.startOffset(),
                nonCompoundCount,
                compoundCount));
          }
        }
        if (count == 100000000) {
          System.out.println(" again...");
          break;
        }
      }
      ts.end();
      netOffset += offsetAtt.endOffset();
    }
    System.out.println("compoundCount=" + compoundCount + " nonCompoundCount=" + nonCompoundCount);
    r.close();
  }
  */

  private void doTestBocchan(int numIterations) throws Exception {
    LineNumberReader reader = new LineNumberReader(new InputStreamReader(
        this.getClass().getResourceAsStream("bocchan.utf-8")));
    String line = reader.readLine();
    reader.close();

    if (VERBOSE) {
      System.out.println("Test for Bocchan without pre-splitting sentences");
    }

    /*
    if (numIterations > 1) {
      // warmup
      for (int i = 0; i < numIterations; i++) {
        final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
        ts.reset();
        while(ts.incrementToken());
      }
    }
    */

    long totalStart = System.currentTimeMillis();
    for (int i = 0; i < numIterations; i++) {
      final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(line));
      ts.reset();
      while(ts.incrementToken());
    }
    String[] sentences = line.split("、|。");
    if (VERBOSE) {
      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
      System.out.println("Test for Bocchan with pre-splitting sentences (" + sentences.length + " sentences)");
    }
    totalStart = System.currentTimeMillis();
    for (int i = 0; i < numIterations; i++) {
      for (String sentence: sentences) {
        final TokenStream ts = analyzer.tokenStream("ignored", new StringReader(sentence));
        ts.reset();
        while(ts.incrementToken());
      }
    }
    if (VERBOSE) {
      System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
    }
  }
}

@ -27,20 +27,19 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.util.IOUtils;

public class TestSearchMode extends BaseTokenStreamTestCase {
  private final static String SEGMENTATION_FILENAME = "search-segmentation-tests.txt";
  private final Segmenter segmenter = new Segmenter(Mode.SEARCH);
  private final Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new KuromojiTokenizer(segmenter, reader);
      Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, Mode.SEARCH);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };

  /** Test search mode segmentation */
  public void testSearchSegmentation() throws IOException {
    InputStream is = TestSearchMode.class.getResourceAsStream(SEGMENTATION_FILENAME);

@ -63,7 +62,18 @@ public class TestSearchMode extends BaseTokenStreamTestCase {
        String[] fields = line.split("\t", 2);
        String sourceText = fields[0];
        String[] expectedTokens = fields[1].split("\\s+");
        assertAnalyzesTo(analyzer, sourceText, expectedTokens);
        int[] expectedPosIncrs = new int[expectedTokens.length];
        int[] expectedPosLengths = new int[expectedTokens.length];
        for(int tokIDX=0;tokIDX<expectedTokens.length;tokIDX++) {
          if (expectedTokens[tokIDX].endsWith("/0")) {
            expectedTokens[tokIDX] = expectedTokens[tokIDX].replace("/0", "");
            expectedPosLengths[tokIDX] = expectedTokens.length-1;
          } else {
            expectedPosIncrs[tokIDX] = 1;
            expectedPosLengths[tokIDX] = 1;
          }
        }
        assertAnalyzesTo(analyzer, sourceText, expectedTokens, expectedPosIncrs);
      }
    } finally {
      is.close();
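To make the /0 convention above concrete: a token suffixed with /0 in search-segmentation-tests.txt is expected with position increment 0, i.e. the compound is emitted at the same position as the first of its decompounded parts, and its span across the parts is carried by the new PositionLengthAttribute. A minimal sketch of a consumer reading both attributes; the field name is a placeholder, and the posLength value is an assumption based on the three-way split:

    TokenStream ts = analyzer.tokenStream("field", new StringReader("関西国際空港"));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // expected in Mode.SEARCH: 関西 (posInc=1), the compound 関西国際空港 at the
      // same position (posInc=0, presumably posLen=3), then 国際 and 空港 (posInc=1)
      System.out.println(termAtt + " posInc=" + posIncAtt.getPositionIncrement()
          + " posLen=" + posLenAtt.getPositionLength());
    }
    ts.end();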

@ -23,29 +23,17 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.io.IOException;

import org.apache.lucene.analysis.kuromoji.SegmenterTest;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.kuromoji.TestKuromojiTokenizer;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

public class UserDictionaryTest extends LuceneTestCase {

  private UserDictionary readDict() throws IOException {
    InputStream is = SegmenterTest.class.getResourceAsStream("userdict.txt");
    if (is == null)
      throw new FileNotFoundException("Cannot find userdict.txt in test classpath!");
    try {
      Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
      return new UserDictionary(reader);
    } finally {
      is.close();
    }
  }

  @Test
  public void testLookup() throws IOException {
    UserDictionary dictionary = readDict();
    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
    String s = "関西国際空港に行った";
    int[][] dictionaryEntryResult = dictionary.lookup(s.toCharArray(), 0, s.length());
    // Length should be three: 関西, 国際, 空港

@ -69,7 +57,7 @@ public class UserDictionaryTest extends LuceneTestCase {

  @Test
  public void testReadings() throws IOException {
    UserDictionary dictionary = readDict();
    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
    int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
    assertEquals(3, result.length);
    int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞

@ -83,7 +71,7 @@ public class UserDictionaryTest extends LuceneTestCase {

  @Test
  public void testPartOfSpeech() throws IOException {
    UserDictionary dictionary = readDict();
    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
    int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
    assertEquals(3, result.length);
    int wordIdKeizai = result[1][0]; // wordId of 経済 in 日本経済新聞

@ -92,7 +80,7 @@ public class UserDictionaryTest extends LuceneTestCase {

  @Test
  public void testRead() throws IOException {
    UserDictionary dictionary = readDict();
    UserDictionary dictionary = TestKuromojiTokenizer.readDict();
    assertNotNull(dictionary);
  }
}

@ -25,43 +25,45 @@
##

# Kansai International Airport
関西国際空港 関西 国際 空港
関西国際空港 関西 関西国際空港/0 国際 空港
# Narita Airport
成田空港 成田 空港
成田空港 成田 成田空港/0 空港
# Haneda Airport
羽田空港 羽田 空港
羽田空港 羽田 羽田空港/0 空港
# Nara Institute of Science and Technology
奈良先端科学技術大学院大学 奈良 先端 科学 技術 大学院 大学
奈良先端科学技術大学院大学 奈良 奈良先端科学技術大学院大学/0 先端 科学 技術 大学院 大学
# Tokyo University
東京大学 東京 大学
東京大学 東京 東京大学/0 大学
# Kyoto University
京都大学 京都 大学
京都大学 京都 京都大学/0 大学

# NOTE: differs from non-compound mode:
# Kyoto University Baseball Club
京都大学硬式野球部 京都 大学 硬式 野球 部
京都大学硬式野球部 京都大 学 硬式 野球 部

##
## Katakana titles
##

# Senior Software Engineer
シニアソフトウェアエンジニア シニア ソフトウェア エンジニア
シニアソフトウェアエンジニア シニア シニアソフトウェアエンジニア/0 ソフトウェア エンジニア
# Software Engineer
ソフトウェアエンジニア ソフトウェア エンジニア
# Senior Project Manager
シニアプロジェクトマネジャー シニア プロジェクト マネジャー
シニアプロジェクトマネジャー シニア シニアプロジェクトマネジャー/0 プロジェクト マネジャー
# Project Manager
プロジェクトマネジャー プロジェクト マネジャー
# Senior Sales Engineer
シニアセールスエンジニア シニア セールス エンジニア
シニアセールスエンジニア シニア シニアセールスエンジニア/0 セールス エンジニア
# System Architect
システムアーキテクト システム アーキテクト
システムアーキテクト システム システムアーキテクト/0 アーキテクト
# Senior System Architect
シニアシステムアーキテクト シニア システム アーキテクト
シニアシステムアーキテクト シニア シニアシステムアーキテクト/0 システム アーキテクト
# System Administrator
システムアドミニストレータ システム アドミニストレータ
システムアドミニストレーター システム アドミニストレーター
システムアドミニストレーター システム システムアドミニストレーター/0 アドミニストレーター
# Senior System Administrator
シニアシステムアドミニストレーター シニア システム アドミニストレーター
シニアシステムアドミニストレーター シニア シニアシステムアドミニストレーター/0 システム アドミニストレーター

##
## Company names (several are fictitious)

@ -70,25 +72,25 @@
# SoftBank Mobile
ソフトバンクモバイル ソフトバンク モバイル
# Alpine Materials
アルパインマテリアルズ アルパイン マテリアルズ
アルパインマテリアルズ アルパイン アルパインマテリアルズ/0 マテリアルズ
# Sapporo Holdings
サッポロホールディングス サッポロ ホールディングス
# Yamada Corporation
ヤマダコーポレーション ヤマダ コーポレーション
ヤマダコーポレーション ヤマダ ヤマダコーポレーション/0 コーポレーション
# Canon Semiconductor Equipment NOTE: Semiconductor becomes semi + conductor
キヤノンセミコンダクターエクィップメント キヤノン セミ コンダクター エクィップメント
キヤノンセミコンダクターエクィップメント キヤノン キヤノンセミコンダクターエクィップメント/0 セミ コンダクター エクィップメント
# Oriental Chain
オリエンタルチエン オリエンタル チエン
オリエンタルチエン オリエンタル オリエンタルチエン/0 チエン
# Ally Projects Japan NOTE: Becomes one token as プロジェクツ is not in IPADIC
アーリープロジェクツジャパン アーリープロジェクツジャパン
# Peter Pan Corporation
ピーターパンコーポレーション ピーター パン コーポレーション
ピーターパンコーポレーション ピーター ピーターパンコーポレーション/0 パン コーポレーション
# AIM Create
エイムクリエイツ エイムクリエイツ
# Mars Engineering
マースエンジニアリング マース エンジニアリング
マースエンジニアリング マース マースエンジニアリング/0 エンジニアリング
# Fuji Protein Technology
フジプロテインテクノロジー フジ プロテイン テクノロジー
フジプロテインテクノロジー フジ フジプロテインテクノロジー/0 プロテイン テクノロジー

##
## Person names

@ -100,7 +102,7 @@
スティーブジョブズ スティーブ ジョブズ
# Harry Potter NOTE: Becomes one token (short word)
ハリーポッター ハリーポッター
# Bill Gates NOTE: Becomes one token (short work)
# Bill Gates NOTE: Becomes one token (short word)
ビルゲイツ ビルゲイツ
# Sean Connery NOTE: Becomes one token (okay)
ショーンコネリー ショーンコネリー

@ -133,8 +135,8 @@
##

# JT Engineering NOTE: Becomes J Tien ginia ring (substrings are in IPADIC)
ジェイティエンジニアリング ジェイ ティエン ジニア リング
ジェイティエンジニアリング ジェイ ジェイティエンジニアリング/0 ティエン ジニア リング
# Anchovy pasta NOTE: Becomes Anch yvipasta
アンチョビパスタ アンチ ョビパスタ
アンチョビパスタ アンチ アンチョビパスタ/0 ョビパスタ
# Surprise gift NOTE: Becomes one token (surprise not in IPADIC)
サプライズギフト サプライズギフト

@ -4,3 +4,7 @@

# Custom reading for sumo wrestler
朝青龍,朝青龍,アサショウリュウ,カスタム人名

# Silly entry:
abcd,a b cd,foo1 foo2 foo3,bar
abcdefg,ab cd efg,foo1 foo2 foo4,bar
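For context, each data row in userdict.txt appears to follow Kuromoji's user-dictionary CSV layout: surface form, whitespace-separated segmentation, whitespace-separated readings (one per segment), and a part-of-speech tag. A hypothetical entry in that layout, shown for illustration and not part of this commit:

# surface,segmentation,readings,part-of-speech
関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞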

@ -28,8 +28,7 @@ import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer;
import org.apache.lucene.analysis.kuromoji.Segmenter;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.BaseTokenizerFactory;

@ -88,7 +87,7 @@ public class KuromojiTokenizerFactory extends BaseTokenizerFactory implements Re

  @Override
  public Tokenizer create(Reader input) {
    return new KuromojiTokenizer(new Segmenter(userDictionary, mode), input);
    return new KuromojiTokenizer(input, userDictionary, true, mode);
  }

  private Mode getMode(Map<String, String> args) {

@ -96,7 +95,7 @@ public class KuromojiTokenizerFactory extends BaseTokenizerFactory implements Re
    if (mode != null) {
      return Mode.valueOf(mode.toUpperCase(Locale.ENGLISH));
    } else {
      return Segmenter.DEFAULT_MODE;
      return KuromojiTokenizer.DEFAULT_MODE;
    }
  }
}

@ -50,7 +50,7 @@ public class TestKuromojiTokenizerFactory extends BaseTokenTestCase {
    factory.inform(new SolrResourceLoader(null, null));
    TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア"));
    assertTokenStreamContents(ts,
        new String[] { "シニア", "ソフトウェア", "エンジニア" }
        new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" }
    );
  }