1
0
mirror of https://github.com/apache/lucene.git synced 2025-02-20 08:56:03 +00:00

LUCENE-3699: share baseform with surface and flag if the reading can be computed from surface

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232265 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-01-17 02:12:27 +00:00
parent c902f63125
commit 48c01e5a2b
11 changed files with 110 additions and 46 deletions
modules/analysis/kuromoji/src
java/org/apache/lucene/analysis/kuromoji
resources/org/apache/lucene/analysis/kuromoji/dict
test/org/apache/lucene/analysis/kuromoji/dict
tools/java/org/apache/lucene/analysis/kuromoji/util

@ -75,14 +75,14 @@ public class Token {
* @return reading. null if token doesn't have reading.
*/
public String getReading() {
return dictionary.getReading(wordId);
return dictionary.getReading(wordId, surfaceForm, offset, length);
}
/**
* @return pronunciation. null if token doesn't have pronunciation.
*/
public String getPronunciation() {
return dictionary.getPronunciation(wordId);
return dictionary.getPronunciation(wordId, surfaceForm, offset, length);
}
/**
@ -110,7 +110,7 @@ public class Token {
* @return base form or null if token is not inflected
*/
public String getBaseForm() {
return dictionary.getBaseForm(wordId);
return dictionary.getBaseForm(wordId, surfaceForm, offset, length);
}
/**

@ -149,12 +149,12 @@ public abstract class BinaryDictionary implements Dictionary {
@Override
public int getLeftId(int wordId) {
return buffer.getShort(wordId) >>> 2;
return buffer.getShort(wordId) >>> 3;
}
@Override
public int getRightId(int wordId) {
return buffer.getShort(wordId) >>> 2;
return buffer.getShort(wordId) >>> 3;
}
@Override
@ -163,21 +163,42 @@ public abstract class BinaryDictionary implements Dictionary {
}
@Override
public String getBaseForm(int wordId) {
public String getBaseForm(int wordId, char surfaceForm[], int off, int len) {
if (hasBaseFormData(wordId)) {
int offset = baseFormOffset(wordId);
int length = buffer.get(offset++) & 0xff;
return readString(offset, length, false);
int data = buffer.get(offset++) & 0xff;
int prefix = data >>> 4;
int suffix = data & 0xF;
char text[] = new char[prefix+suffix];
System.arraycopy(surfaceForm, off, text, 0, prefix);
for (int i = 0; i < suffix; i++) {
text[prefix+i] = buffer.getChar(offset + (i << 1));
}
return new String(text);
} else {
return null;
}
}
@Override
public String getReading(int wordId) {
int offset = readingOffset(wordId);
int readingData = buffer.get(offset++) & 0xff;
return readString(offset, readingData >>> 1, (readingData & 1) == 1);
public String getReading(int wordId, char surface[], int off, int len) {
if (hasReadingData(wordId)) {
int offset = readingOffset(wordId);
int readingData = buffer.get(offset++) & 0xff;
return readString(offset, readingData >>> 1, (readingData & 1) == 1);
} else {
// the reading is the surface form, with hiragana shifted to katakana
char text[] = new char[len];
for (int i = 0; i < len; i++) {
char ch = surface[off+i];
if (ch > 0x3040 && ch < 0x3097) {
text[i] = (char)(ch + 0x60);
} else {
text[i] = ch;
}
}
return new String(text);
}
}
@Override
@ -186,13 +207,13 @@ public abstract class BinaryDictionary implements Dictionary {
}
@Override
public String getPronunciation(int wordId) {
public String getPronunciation(int wordId, char surface[], int off, int len) {
if (hasPronunciationData(wordId)) {
int offset = pronunciationOffset(wordId);
int pronunciationData = buffer.get(offset++) & 0xff;
return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
} else {
return getReading(wordId); // same as the reading
return getReading(wordId, surface, off, len); // same as the reading
}
}
@ -213,7 +234,7 @@ public abstract class BinaryDictionary implements Dictionary {
private int readingOffset(int wordId) {
int offset = baseFormOffset(wordId);
if (hasBaseFormData(wordId)) {
int baseFormLength = buffer.get(offset++) & 0xff;
int baseFormLength = buffer.get(offset++) & 0xf;
return offset + (baseFormLength << 1);
} else {
return offset;
@ -221,21 +242,29 @@ public abstract class BinaryDictionary implements Dictionary {
}
private int pronunciationOffset(int wordId) {
int offset = readingOffset(wordId);
int readingData = buffer.get(offset++) & 0xff;
final int readingLength;
if ((readingData & 1) == 0) {
readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
if (hasReadingData(wordId)) {
int offset = readingOffset(wordId);
int readingData = buffer.get(offset++) & 0xff;
final int readingLength;
if ((readingData & 1) == 0) {
readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
} else {
readingLength = readingData >>> 1;
}
return offset + readingLength;
} else {
readingLength = readingData >>> 1;
return readingOffset(wordId);
}
return offset + readingLength;
}
private boolean hasBaseFormData(int wordId) {
return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
}
private boolean hasReadingData(int wordId) {
return (buffer.getShort(wordId) & HAS_READING) != 0;
}
private boolean hasPronunciationData(int wordId) {
return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
}
@ -256,6 +285,8 @@ public abstract class BinaryDictionary implements Dictionary {
/** flag that the entry has baseform data. otherwise its not inflected (same as surface form) */
public static final int HAS_BASEFORM = 1;
/** flag that the entry has reading data. otherwise reading is surface form converted to katakana */
public static final int HAS_READING = 2;
/** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
public static final int HAS_PRONUNCIATION = 2;
public static final int HAS_PRONUNCIATION = 4;
}

@ -54,21 +54,21 @@ public interface Dictionary {
* @param wordId word ID of token
* @return Reading of the token
*/
public String getReading(int wordId);
public String getReading(int wordId, char surface[], int off, int len);
/**
* Get base form of word
* @param wordId word ID of token
* @return Base form (only different for inflected words, otherwise null)
*/
public String getBaseForm(int wordId);
public String getBaseForm(int wordId, char surface[], int off, int len);
/**
* Get pronunciation of tokens
* @param wordId word ID of token
* @return Pronunciation of the token
*/
public String getPronunciation(int wordId);
public String getPronunciation(int wordId, char surface[], int off, int len);
/**
* Get inflection type of tokens

@ -51,7 +51,7 @@ public final class UnknownDictionary extends BinaryDictionary {
}
@Override
public String getReading(int wordId) {
public String getReading(int wordId, char surface[], int off, int len) {
return null;
}

@ -196,7 +196,7 @@ public final class UserDictionary implements Dictionary {
}
@Override
public String getReading(int wordId) {
public String getReading(int wordId, char surface[], int off, int len) {
return getFeature(wordId, 0);
}
@ -206,12 +206,12 @@ public final class UserDictionary implements Dictionary {
}
@Override
public String getBaseForm(int wordId) {
public String getBaseForm(int wordId, char surface[], int off, int len) {
return null; // TODO: add support?
}
@Override
public String getPronunciation(int wordId) {
public String getPronunciation(int wordId, char surface[], int off, int len) {
return null; // TODO: add support?
}

@ -61,7 +61,7 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
assertTrue(wordId > lastWordId);
lastWordId = wordId;
String baseForm = tid.getBaseForm(wordId);
String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
String inflectionForm = tid.getInflectionForm(wordId);
@ -91,11 +91,11 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
// check that its actually an ipadic pos tag
assertNotNull(ToStringUtil.getPOSTranslation(pos));
String pronunciation = tid.getPronunciation(wordId);
String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
assertNotNull(pronunciation);
assertTrue(UnicodeUtil.validUTF16String(pronunciation));
String reading = tid.getReading(wordId);
String reading = tid.getReading(wordId, chars, 0, chars.length);
assertNotNull(reading);
assertTrue(UnicodeUtil.validUTF16String(reading));
}

@ -73,12 +73,12 @@ public class UserDictionaryTest extends LuceneTestCase {
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
assertEquals(3, result.length);
int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
assertEquals("ニホン", dictionary.getReading(wordIdNihon));
assertEquals("ニホン", dictionary.getReading(wordIdNihon, "日本".toCharArray(), 0, 2));
result = dictionary.lookup("朝青龍".toCharArray(), 0, 3);
assertEquals(1, result.length);
int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍
assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu, "朝青龍".toCharArray(), 0, 3));
}
@Test

@ -103,12 +103,15 @@ public abstract class BinaryDictionaryWriter {
if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
flags |= BinaryDictionary.HAS_BASEFORM;
}
if (!reading.equals(toKatakana(entry[0]))) {
flags |= BinaryDictionary.HAS_READING;
}
if (!pronunciation.equals(reading)) {
flags |= BinaryDictionary.HAS_PRONUNCIATION;
}
assert leftId == rightId;
assert leftId < 8192; // there are still unused bits
assert leftId < 4096; // there are still unused bits
// add pos mapping
int toFill = 1+leftId - posDict.size();
for (int i = 0; i < toFill; i++) {
@ -119,27 +122,36 @@ public abstract class BinaryDictionaryWriter {
assert existing == null || existing.equals(fullPOSData);
posDict.set(leftId, fullPOSData);
buffer.putShort((short)(leftId << 2 | flags));
buffer.putShort((short)(leftId << 3 | flags));
buffer.putShort(wordCost);
if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
buffer.put((byte) baseForm.length());
for (int i = 0; i < baseForm.length(); i++) {
assert baseForm.length() < 16;
int shared = sharedPrefix(entry[0], baseForm);
int suffix = baseForm.length() - shared;
buffer.put((byte) (shared << 4 | suffix));
for (int i = shared; i < baseForm.length(); i++) {
buffer.putChar(baseForm.charAt(i));
}
}
if (isKatakana(reading)) {
buffer.put((byte) (reading.length() << 1 | 1));
writeKatakana(reading);
} else {
buffer.put((byte) (reading.length() << 1));
for (int i = 0; i < reading.length(); i++) {
buffer.putChar(reading.charAt(i));
if ((flags & BinaryDictionary.HAS_READING) != 0) {
if (isKatakana(reading)) {
buffer.put((byte) (reading.length() << 1 | 1));
writeKatakana(reading);
} else {
buffer.put((byte) (reading.length() << 1));
for (int i = 0; i < reading.length(); i++) {
buffer.putChar(reading.charAt(i));
}
}
}
if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
// we can save 150KB here, but it makes the reader a little complicated.
// int shared = sharedPrefix(reading, pronunciation);
// buffer.put((byte) shared);
// pronunciation = pronunciation.substring(shared);
if (isKatakana(pronunciation)) {
buffer.put((byte) (pronunciation.length() << 1 | 1));
writeKatakana(pronunciation);
@ -170,6 +182,27 @@ public abstract class BinaryDictionaryWriter {
}
}
private String toKatakana(String s) {
char text[] = new char[s.length()];
for (int i = 0; i < s.length(); i++) {
char ch = s.charAt(i);
if (ch > 0x3040 && ch < 0x3097) {
text[i] = (char)(ch + 0x60);
} else {
text[i] = ch;
}
}
return new String(text);
}
public static int sharedPrefix(String left, String right) {
int len = left.length() < right.length() ? left.length() : right.length();
for (int i = 0; i < len; i++)
if (left.charAt(i) != right.charAt(i))
return i;
return len;
}
public void addMapping(int sourceId, int wordId) {
assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;