mirror of
https://github.com/apache/lucene.git
synced 2025-02-20 08:56:03 +00:00
LUCENE-3699: share baseform with surface and flag if the reading can be computed from surface
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232265 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c902f63125
commit
48c01e5a2b
modules/analysis/kuromoji/src
java/org/apache/lucene/analysis/kuromoji
resources/org/apache/lucene/analysis/kuromoji/dict
test/org/apache/lucene/analysis/kuromoji/dict
tools/java/org/apache/lucene/analysis/kuromoji/util
@ -75,14 +75,14 @@ public class Token {
|
||||
* @return reading. null if token doesn't have reading.
|
||||
*/
|
||||
public String getReading() {
|
||||
return dictionary.getReading(wordId);
|
||||
return dictionary.getReading(wordId, surfaceForm, offset, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return pronunciation. null if token doesn't have pronunciation.
|
||||
*/
|
||||
public String getPronunciation() {
|
||||
return dictionary.getPronunciation(wordId);
|
||||
return dictionary.getPronunciation(wordId, surfaceForm, offset, length);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -110,7 +110,7 @@ public class Token {
|
||||
* @return base form or null if token is not inflected
|
||||
*/
|
||||
public String getBaseForm() {
|
||||
return dictionary.getBaseForm(wordId);
|
||||
return dictionary.getBaseForm(wordId, surfaceForm, offset, length);
|
||||
}
|
||||
|
||||
/**
|
||||
|
71
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
71
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
@ -149,12 +149,12 @@ public abstract class BinaryDictionary implements Dictionary {
|
||||
|
||||
@Override
|
||||
public int getLeftId(int wordId) {
|
||||
return buffer.getShort(wordId) >>> 2;
|
||||
return buffer.getShort(wordId) >>> 3;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getRightId(int wordId) {
|
||||
return buffer.getShort(wordId) >>> 2;
|
||||
return buffer.getShort(wordId) >>> 3;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -163,21 +163,42 @@ public abstract class BinaryDictionary implements Dictionary {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getBaseForm(int wordId) {
|
||||
public String getBaseForm(int wordId, char surfaceForm[], int off, int len) {
|
||||
if (hasBaseFormData(wordId)) {
|
||||
int offset = baseFormOffset(wordId);
|
||||
int length = buffer.get(offset++) & 0xff;
|
||||
return readString(offset, length, false);
|
||||
int data = buffer.get(offset++) & 0xff;
|
||||
int prefix = data >>> 4;
|
||||
int suffix = data & 0xF;
|
||||
char text[] = new char[prefix+suffix];
|
||||
System.arraycopy(surfaceForm, off, text, 0, prefix);
|
||||
for (int i = 0; i < suffix; i++) {
|
||||
text[prefix+i] = buffer.getChar(offset + (i << 1));
|
||||
}
|
||||
return new String(text);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getReading(int wordId) {
|
||||
int offset = readingOffset(wordId);
|
||||
int readingData = buffer.get(offset++) & 0xff;
|
||||
return readString(offset, readingData >>> 1, (readingData & 1) == 1);
|
||||
public String getReading(int wordId, char surface[], int off, int len) {
|
||||
if (hasReadingData(wordId)) {
|
||||
int offset = readingOffset(wordId);
|
||||
int readingData = buffer.get(offset++) & 0xff;
|
||||
return readString(offset, readingData >>> 1, (readingData & 1) == 1);
|
||||
} else {
|
||||
// the reading is the surface form, with hiragana shifted to katakana
|
||||
char text[] = new char[len];
|
||||
for (int i = 0; i < len; i++) {
|
||||
char ch = surface[off+i];
|
||||
if (ch > 0x3040 && ch < 0x3097) {
|
||||
text[i] = (char)(ch + 0x60);
|
||||
} else {
|
||||
text[i] = ch;
|
||||
}
|
||||
}
|
||||
return new String(text);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -186,13 +207,13 @@ public abstract class BinaryDictionary implements Dictionary {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPronunciation(int wordId) {
|
||||
public String getPronunciation(int wordId, char surface[], int off, int len) {
|
||||
if (hasPronunciationData(wordId)) {
|
||||
int offset = pronunciationOffset(wordId);
|
||||
int pronunciationData = buffer.get(offset++) & 0xff;
|
||||
return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
|
||||
} else {
|
||||
return getReading(wordId); // same as the reading
|
||||
return getReading(wordId, surface, off, len); // same as the reading
|
||||
}
|
||||
}
|
||||
|
||||
@ -213,7 +234,7 @@ public abstract class BinaryDictionary implements Dictionary {
|
||||
private int readingOffset(int wordId) {
|
||||
int offset = baseFormOffset(wordId);
|
||||
if (hasBaseFormData(wordId)) {
|
||||
int baseFormLength = buffer.get(offset++) & 0xff;
|
||||
int baseFormLength = buffer.get(offset++) & 0xf;
|
||||
return offset + (baseFormLength << 1);
|
||||
} else {
|
||||
return offset;
|
||||
@ -221,21 +242,29 @@ public abstract class BinaryDictionary implements Dictionary {
|
||||
}
|
||||
|
||||
private int pronunciationOffset(int wordId) {
|
||||
int offset = readingOffset(wordId);
|
||||
int readingData = buffer.get(offset++) & 0xff;
|
||||
final int readingLength;
|
||||
if ((readingData & 1) == 0) {
|
||||
readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
|
||||
if (hasReadingData(wordId)) {
|
||||
int offset = readingOffset(wordId);
|
||||
int readingData = buffer.get(offset++) & 0xff;
|
||||
final int readingLength;
|
||||
if ((readingData & 1) == 0) {
|
||||
readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
|
||||
} else {
|
||||
readingLength = readingData >>> 1;
|
||||
}
|
||||
return offset + readingLength;
|
||||
} else {
|
||||
readingLength = readingData >>> 1;
|
||||
return readingOffset(wordId);
|
||||
}
|
||||
return offset + readingLength;
|
||||
}
|
||||
|
||||
private boolean hasBaseFormData(int wordId) {
|
||||
return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
|
||||
}
|
||||
|
||||
private boolean hasReadingData(int wordId) {
|
||||
return (buffer.getShort(wordId) & HAS_READING) != 0;
|
||||
}
|
||||
|
||||
private boolean hasPronunciationData(int wordId) {
|
||||
return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
|
||||
}
|
||||
@ -256,6 +285,8 @@ public abstract class BinaryDictionary implements Dictionary {
|
||||
|
||||
/** flag that the entry has baseform data. otherwise it's not inflected (same as surface form) */
|
||||
public static final int HAS_BASEFORM = 1;
|
||||
/** flag that the entry has reading data. otherwise reading is surface form converted to katakana */
|
||||
public static final int HAS_READING = 2;
|
||||
/** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
|
||||
public static final int HAS_PRONUNCIATION = 2;
|
||||
public static final int HAS_PRONUNCIATION = 4;
|
||||
}
|
||||
|
@ -54,21 +54,21 @@ public interface Dictionary {
|
||||
* @param wordId word ID of token
|
||||
* @return Reading of the token
|
||||
*/
|
||||
public String getReading(int wordId);
|
||||
public String getReading(int wordId, char surface[], int off, int len);
|
||||
|
||||
/**
|
||||
* Get base form of word
|
||||
* @param wordId word ID of token
|
||||
* @return Base form (only different for inflected words, otherwise null)
|
||||
*/
|
||||
public String getBaseForm(int wordId);
|
||||
public String getBaseForm(int wordId, char surface[], int off, int len);
|
||||
|
||||
/**
|
||||
* Get pronunciation of tokens
|
||||
* @param wordId word ID of token
|
||||
* @return Pronunciation of the token
|
||||
*/
|
||||
public String getPronunciation(int wordId);
|
||||
public String getPronunciation(int wordId, char surface[], int off, int len);
|
||||
|
||||
/**
|
||||
* Get inflection type of tokens
|
||||
|
2
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
2
modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
@ -51,7 +51,7 @@ public final class UnknownDictionary extends BinaryDictionary {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getReading(int wordId) {
|
||||
public String getReading(int wordId, char surface[], int off, int len) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
@ -196,7 +196,7 @@ public final class UserDictionary implements Dictionary {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getReading(int wordId) {
|
||||
public String getReading(int wordId, char surface[], int off, int len) {
|
||||
return getFeature(wordId, 0);
|
||||
}
|
||||
|
||||
@ -206,12 +206,12 @@ public final class UserDictionary implements Dictionary {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getBaseForm(int wordId) {
|
||||
public String getBaseForm(int wordId, char surface[], int off, int len) {
|
||||
return null; // TODO: add support?
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPronunciation(int wordId) {
|
||||
public String getPronunciation(int wordId, char surface[], int off, int len) {
|
||||
return null; // TODO: add support?
|
||||
}
|
||||
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -61,7 +61,7 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
|
||||
assertTrue(wordId > lastWordId);
|
||||
lastWordId = wordId;
|
||||
|
||||
String baseForm = tid.getBaseForm(wordId);
|
||||
String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
|
||||
assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
|
||||
|
||||
String inflectionForm = tid.getInflectionForm(wordId);
|
||||
@ -91,11 +91,11 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
|
||||
// check that it's actually an ipadic pos tag
|
||||
assertNotNull(ToStringUtil.getPOSTranslation(pos));
|
||||
|
||||
String pronunciation = tid.getPronunciation(wordId);
|
||||
String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
|
||||
assertNotNull(pronunciation);
|
||||
assertTrue(UnicodeUtil.validUTF16String(pronunciation));
|
||||
|
||||
String reading = tid.getReading(wordId);
|
||||
String reading = tid.getReading(wordId, chars, 0, chars.length);
|
||||
assertNotNull(reading);
|
||||
assertTrue(UnicodeUtil.validUTF16String(reading));
|
||||
}
|
||||
|
4
modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
4
modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
@ -73,12 +73,12 @@ public class UserDictionaryTest extends LuceneTestCase {
|
||||
int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
|
||||
assertEquals(3, result.length);
|
||||
int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
|
||||
assertEquals("ニホン", dictionary.getReading(wordIdNihon));
|
||||
assertEquals("ニホン", dictionary.getReading(wordIdNihon, "日本".toCharArray(), 0, 2));
|
||||
|
||||
result = dictionary.lookup("朝青龍".toCharArray(), 0, 3);
|
||||
assertEquals(1, result.length);
|
||||
int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍
|
||||
assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
|
||||
assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu, "朝青龍".toCharArray(), 0, 3));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -103,12 +103,15 @@ public abstract class BinaryDictionaryWriter {
|
||||
if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
|
||||
flags |= BinaryDictionary.HAS_BASEFORM;
|
||||
}
|
||||
if (!reading.equals(toKatakana(entry[0]))) {
|
||||
flags |= BinaryDictionary.HAS_READING;
|
||||
}
|
||||
if (!pronunciation.equals(reading)) {
|
||||
flags |= BinaryDictionary.HAS_PRONUNCIATION;
|
||||
}
|
||||
|
||||
assert leftId == rightId;
|
||||
assert leftId < 8192; // there are still unused bits
|
||||
assert leftId < 4096; // there are still unused bits
|
||||
// add pos mapping
|
||||
int toFill = 1+leftId - posDict.size();
|
||||
for (int i = 0; i < toFill; i++) {
|
||||
@ -119,27 +122,36 @@ public abstract class BinaryDictionaryWriter {
|
||||
assert existing == null || existing.equals(fullPOSData);
|
||||
posDict.set(leftId, fullPOSData);
|
||||
|
||||
buffer.putShort((short)(leftId << 2 | flags));
|
||||
buffer.putShort((short)(leftId << 3 | flags));
|
||||
buffer.putShort(wordCost);
|
||||
|
||||
if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
|
||||
buffer.put((byte) baseForm.length());
|
||||
for (int i = 0; i < baseForm.length(); i++) {
|
||||
assert baseForm.length() < 16;
|
||||
int shared = sharedPrefix(entry[0], baseForm);
|
||||
int suffix = baseForm.length() - shared;
|
||||
buffer.put((byte) (shared << 4 | suffix));
|
||||
for (int i = shared; i < baseForm.length(); i++) {
|
||||
buffer.putChar(baseForm.charAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
if (isKatakana(reading)) {
|
||||
buffer.put((byte) (reading.length() << 1 | 1));
|
||||
writeKatakana(reading);
|
||||
} else {
|
||||
buffer.put((byte) (reading.length() << 1));
|
||||
for (int i = 0; i < reading.length(); i++) {
|
||||
buffer.putChar(reading.charAt(i));
|
||||
if ((flags & BinaryDictionary.HAS_READING) != 0) {
|
||||
if (isKatakana(reading)) {
|
||||
buffer.put((byte) (reading.length() << 1 | 1));
|
||||
writeKatakana(reading);
|
||||
} else {
|
||||
buffer.put((byte) (reading.length() << 1));
|
||||
for (int i = 0; i < reading.length(); i++) {
|
||||
buffer.putChar(reading.charAt(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
|
||||
// we can save 150KB here, but it makes the reader a little complicated.
|
||||
// int shared = sharedPrefix(reading, pronunciation);
|
||||
// buffer.put((byte) shared);
|
||||
// pronunciation = pronunciation.substring(shared);
|
||||
if (isKatakana(pronunciation)) {
|
||||
buffer.put((byte) (pronunciation.length() << 1 | 1));
|
||||
writeKatakana(pronunciation);
|
||||
@ -170,6 +182,27 @@ public abstract class BinaryDictionaryWriter {
|
||||
}
|
||||
}
|
||||
|
||||
private String toKatakana(String s) {
|
||||
char text[] = new char[s.length()];
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
char ch = s.charAt(i);
|
||||
if (ch > 0x3040 && ch < 0x3097) {
|
||||
text[i] = (char)(ch + 0x60);
|
||||
} else {
|
||||
text[i] = ch;
|
||||
}
|
||||
}
|
||||
return new String(text);
|
||||
}
|
||||
|
||||
public static int sharedPrefix(String left, String right) {
|
||||
int len = left.length() < right.length() ? left.length() : right.length();
|
||||
for (int i = 0; i < len; i++)
|
||||
if (left.charAt(i) != right.charAt(i))
|
||||
return i;
|
||||
return len;
|
||||
}
|
||||
|
||||
public void addMapping(int sourceId, int wordId) {
|
||||
assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user