LUCENE-3699: simplify dictionary access and reduce tokeninfodictionary 1.5MB

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232120 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-01-16 19:19:48 +00:00
parent 297cc4a275
commit 12c9b8b4bf
12 changed files with 94 additions and 218 deletions

View File

@ -37,12 +37,10 @@ public abstract class BinaryDictionary implements Dictionary {
public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
public static final String INFLDICT_FILENAME_SUFFIX = "$inflDict.dat";
public static final String DICT_HEADER = "kuromoji_dict";
public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
public static final String POSDICT_HEADER = "kuromoji_dict_pos";
public static final String INFLDICT_HEADER = "kuromoji_dict_infl";
public static final int VERSION = 1;
private final ByteBuffer buffer;
@ -52,7 +50,7 @@ public abstract class BinaryDictionary implements Dictionary {
private final String[] inflFormDict;
protected BinaryDictionary() throws IOException {
InputStream mapIS = null, dictIS = null, posIS = null, inflIS = null;
InputStream mapIS = null, dictIS = null, posIS = null;
IOException priorE = null;
int[] targetMapOffsets = null, targetMap = null;
String[] posDict = null;
@ -85,25 +83,24 @@ public abstract class BinaryDictionary implements Dictionary {
posIS = new BufferedInputStream(posIS);
in = new InputStreamDataInput(posIS);
CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
posDict = new String[in.readVInt()];
for (int j = 0; j < posDict.length; j++) {
int posSize = in.readVInt();
posDict = new String[posSize];
inflTypeDict = new String[posSize];
inflFormDict = new String[posSize];
for (int j = 0; j < posSize; j++) {
posDict[j] = in.readString();
inflTypeDict[j] = in.readString();
inflFormDict[j] = in.readString();
// this is how we encode null inflections
if (inflTypeDict[j].length() == 0) {
inflTypeDict[j] = null;
}
if (inflFormDict[j].length() == 0) {
inflFormDict[j] = null;
}
}
posIS.close(); posIS = null;
inflIS = getResource(INFLDICT_FILENAME_SUFFIX);
inflIS = new BufferedInputStream(inflIS);
in = new InputStreamDataInput(inflIS);
CodecUtil.checkHeader(in, INFLDICT_HEADER, VERSION, VERSION);
int length = in.readVInt();
inflTypeDict = new String[length];
inflFormDict = new String[length];
for (int j = 0; j < length; j++) {
inflTypeDict[j] = in.readString();
inflFormDict[j] = in.readString();
}
inflIS.close(); inflIS = null;
dictIS = getResource(DICT_FILENAME_SUFFIX);
// no buffering here, as we load in one large buffer
in = new InputStreamDataInput(dictIS);
@ -120,7 +117,7 @@ public abstract class BinaryDictionary implements Dictionary {
} catch (IOException ioe) {
priorE = ioe;
} finally {
IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, inflIS, dictIS);
IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, dictIS);
}
this.targetMap = targetMap;
@ -152,27 +149,27 @@ public abstract class BinaryDictionary implements Dictionary {
@Override
public int getLeftId(int wordId) {
return buffer.getShort(wordId);
return buffer.getShort(wordId) >>> 2;
}
@Override
public int getRightId(int wordId) {
return buffer.getShort(wordId + 2); // Skip left id
return buffer.getShort(wordId) >>> 2;
}
@Override
public int getWordCost(int wordId) {
return buffer.getShort(wordId + 4); // Skip left id and right id
return buffer.getShort(wordId + 2); // Skip id
}
@Override
public String getBaseForm(int wordId) {
int offset = baseFormOffset(wordId);
int length = (buffer.get(offset++) & 0xff) >>> 1;
if (length == 0) {
return null; // same as surface form
} else {
if (hasBaseFormData(wordId)) {
int offset = baseFormOffset(wordId);
int length = buffer.get(offset++) & 0xff;
return readString(offset, length, false);
} else {
return null;
}
}
@ -185,8 +182,7 @@ public abstract class BinaryDictionary implements Dictionary {
@Override
public String getPartOfSpeech(int wordId) {
int posIndex = buffer.get(posOffset(wordId)) & 0xff; // read index into posDict
return posDict[posIndex >>> 1];
return posDict[getLeftId(wordId)];
}
@Override
@ -202,28 +198,26 @@ public abstract class BinaryDictionary implements Dictionary {
@Override
public String getInflectionType(int wordId) {
int index = getInflectionIndex(wordId);
return index < 0 ? null : inflTypeDict[index];
return inflTypeDict[getLeftId(wordId)];
}
@Override
public String getInflectionForm(int wordId) {
int index = getInflectionIndex(wordId);
return index < 0 ? null : inflFormDict[index];
}
private static int posOffset(int wordId) {
return wordId + 6;
return inflFormDict[getLeftId(wordId)];
}
private static int baseFormOffset(int wordId) {
return wordId + 7;
return wordId + 4;
}
private int readingOffset(int wordId) {
int offset = baseFormOffset(wordId);
int baseFormLength = buffer.get(offset++) & 0xfe; // mask away pronunciation bit
return offset + baseFormLength;
if (hasBaseFormData(wordId)) {
int baseFormLength = buffer.get(offset++) & 0xff;
return offset + (baseFormLength << 1);
} else {
return offset;
}
}
private int pronunciationOffset(int wordId) {
@ -238,41 +232,12 @@ public abstract class BinaryDictionary implements Dictionary {
return offset + readingLength;
}
private boolean hasBaseFormData(int wordId) {
return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
}
private boolean hasPronunciationData(int wordId) {
int baseFormData = buffer.get(baseFormOffset(wordId)) & 0xff;
return (baseFormData & 1) == 0;
}
private boolean hasInflectionData(int wordId) {
int posData = buffer.get(posOffset(wordId)) & 0xff;
return (posData & 1) == 1;
}
private int getInflectionIndex(int wordId) {
if (!hasInflectionData(wordId)) {
return -1; // common case: no inflection data
}
// skip past reading/pronunciation at the end
int offset = hasPronunciationData(wordId) ? pronunciationOffset(wordId) : readingOffset(wordId);
int endData = buffer.get(offset++) & 0xff;
final int endLength;
if ((endData & 1) == 0) {
endLength = endData & 0xfe; // UTF-16: mask off kana bit
} else {
endLength = endData >>> 1;
}
offset += endLength;
byte b = buffer.get(offset++);
int i = b & 0x7F;
if ((b & 0x80) == 0) return i;
b = buffer.get(offset++);
i |= (b & 0x7F) << 7;
assert ((b & 0x80) == 0);
return i;
return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
}
private String readString(int offset, int length, boolean kana) {
@ -288,4 +253,9 @@ public abstract class BinaryDictionary implements Dictionary {
}
return new String(text);
}
/** flag that the entry has baseform data. otherwise its not inflected (same as surface form) */
public static final int HAS_BASEFORM = 1;
/** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
public static final int HAS_PRONUNCIATION = 2;
}

View File

@ -26,11 +26,6 @@ import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.OutputStreamDataOutput;
@ -45,11 +40,7 @@ public abstract class BinaryDictionaryWriter {
private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
private int[] targetMap = new int[8192];
private int[] targetMapOffsets = new int[8192];
private final List<String> posDict = new ArrayList<String>();
private final Map<String,Integer> posDictLookup = new HashMap<String,Integer>();
private final List<String> inflDict = new ArrayList<String>();
private final Map<String,Integer> inflDictLookup = new HashMap<String,Integer>();
private final ArrayList<String> posDict = new ArrayList<String>();
public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
this.implClazz = implClazz;
@ -78,35 +69,20 @@ public abstract class BinaryDictionaryWriter {
sb.append(part);
}
}
String pos = sb.toString();
Integer posIndex = posDictLookup.get(pos);
if (posIndex == null) {
posIndex = posDict.size();
posDict.add(pos);
posDictLookup.put(pos, posIndex);
assert posDict.size() == posDictLookup.size();
}
String posData = sb.toString();
sb.setLength(0);
sb.append(CSVUtil.quoteEscape(entry[8]));
sb.append(CSVUtil.quoteEscape(posData));
sb.append(',');
sb.append(CSVUtil.quoteEscape(entry[9]));
String inflData = sb.toString();
Integer inflIndex = Integer.MAX_VALUE;
int hasInflData;
if ("*,*".equals(inflData)) {
hasInflData = 0; // no inflection data
} else {
hasInflData = 1;
inflIndex = inflDictLookup.get(inflData);
if (inflIndex == null) {
inflIndex = inflDict.size();
inflDict.add(inflData);
inflDictLookup.put(inflData, inflIndex);
assert inflDict.size() == inflDictLookup.size();
}
if (!"*".equals(entry[8])) {
sb.append(CSVUtil.quoteEscape(entry[8]));
}
sb.append(',');
if (!"*".equals(entry[9])) {
sb.append(CSVUtil.quoteEscape(entry[9]));
}
String fullPOSData = sb.toString();
String baseForm = entry[10];
String reading = entry[11];
@ -114,28 +90,40 @@ public abstract class BinaryDictionaryWriter {
// extend buffer if necessary
int left = buffer.remaining();
// worst case: three short, 4 bytes, one vint and features (all as utf-16)
int worstCase = 6 + 4 + 2 + 2*(baseForm.length() + reading.length() + pronunciation.length());
// worst case: two short, 3 bytes, and features (all as utf-16)
int worstCase = 4 + 3 + 2*(baseForm.length() + reading.length() + pronunciation.length());
if (worstCase > left) {
ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1));
buffer.flip();
newBuffer.put(buffer);
buffer = newBuffer;
}
int flags = 0;
if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
flags |= BinaryDictionary.HAS_BASEFORM;
}
if (!pronunciation.equals(reading)) {
flags |= BinaryDictionary.HAS_PRONUNCIATION;
}
assert leftId == rightId;
assert leftId < 8192; // there are still unused bits
// add pos mapping
int toFill = 1+leftId - posDict.size();
for (int i = 0; i < toFill; i++) {
posDict.add(null);
}
buffer.putShort(leftId);
buffer.putShort(rightId);
String existing = posDict.get(leftId);
assert existing == null || existing.equals(fullPOSData);
posDict.set(leftId, fullPOSData);
buffer.putShort((short)(leftId << 2 | flags));
buffer.putShort(wordCost);
assert posIndex.intValue() < 128;
buffer.put((byte) (posIndex.intValue() << 1 | hasInflData));
int pronunciationIsReading = pronunciation.equals(reading) ? 1 : 0;
if ("*".equals(baseForm) || baseForm.equals(entry[0])) {
buffer.put((byte)pronunciationIsReading); // base form is the same as surface form
} else {
assert baseForm.length() < 128;
buffer.put((byte)(baseForm.length() << 1 | pronunciationIsReading));
if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
buffer.put((byte) baseForm.length());
for (int i = 0; i < baseForm.length(); i++) {
buffer.putChar(baseForm.charAt(i));
}
@ -151,7 +139,7 @@ public abstract class BinaryDictionaryWriter {
}
}
if (pronunciationIsReading == 0) {
if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
if (isKatakana(pronunciation)) {
buffer.put((byte) (pronunciation.length() << 1 | 1));
writeKatakana(pronunciation);
@ -163,17 +151,6 @@ public abstract class BinaryDictionaryWriter {
}
}
if (hasInflData > 0) {
int key = inflIndex.intValue();
assert key < 32768; // note there are really like 300 of these...
if (key < 128) {
buffer.put((byte) key);
} else {
buffer.put((byte) ((key & 0x7f) | 0x80));
buffer.put((byte) (key >>> 7));
}
}
return buffer.position();
}
@ -229,7 +206,6 @@ public abstract class BinaryDictionaryWriter {
writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
writeInflDict(baseName + BinaryDictionary.INFLDICT_FILENAME_SUFFIX);
}
// TODO: maybe this int[] should instead be the output to the FST...
@ -271,26 +247,17 @@ public abstract class BinaryDictionaryWriter {
CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
out.writeVInt(posDict.size());
for (String s : posDict) {
out.writeString(s);
}
} finally {
os.close();
}
}
protected void writeInflDict(String filename) throws IOException {
new File(filename).getParentFile().mkdirs();
OutputStream os = new FileOutputStream(filename);
try {
os = new BufferedOutputStream(os);
final DataOutput out = new OutputStreamDataOutput(os);
CodecUtil.writeHeader(out, BinaryDictionary.INFLDICT_HEADER, BinaryDictionary.VERSION);
out.writeVInt(inflDict.size());
for (String s : inflDict) {
String data[] = CSVUtil.parse(s);
assert data.length == 2 : "malformed inflection: " + s;
out.writeString(data[0]);
out.writeString(data[1]);
if (s == null) {
out.writeByte((byte)0);
out.writeByte((byte)0);
out.writeByte((byte)0);
} else {
String data[] = CSVUtil.parse(s);
assert data.length == 3 : "malformed pos/inflection: " + s;
out.writeString(data[0]);
out.writeString(data[1]);
out.writeString(data[2]);
}
}
} finally {
os.close();
@ -313,59 +280,4 @@ public abstract class BinaryDictionaryWriter {
os.close();
}
}
// TODO: the below is messy, but makes the dictionary smaller.
// we track frequencies of inflections so the highest-freq ones have smaller indexes.
/** optional: notes inflection seen in the data up front */
public void noteInflection(String entry[]) {
StringBuilder sb = new StringBuilder();
sb.append(CSVUtil.quoteEscape(entry[8]));
sb.append(',');
sb.append(CSVUtil.quoteEscape(entry[9]));
String s = sb.toString();
if ("*,*".equals(s)) {
return; // no inflection data
}
Integer freq = notedInflections.get(s);
if (freq == null) {
freq = 0;
}
notedInflections.put(s, freq+1);
}
/** prepopulates inflection mapping by frequency */
public void finalizeInflections() {
InflectionAndFreq freqs[] = new InflectionAndFreq[notedInflections.size()];
int upto = 0;
for (Map.Entry<String,Integer> e : notedInflections.entrySet()) {
freqs[upto++] = new InflectionAndFreq(e.getKey(), e.getValue());
}
Arrays.sort(freqs, Collections.reverseOrder());
for (int i = 0; i < upto; i++) {
inflDict.add(freqs[i].inflection);
inflDictLookup.put(freqs[i].inflection, i);
}
}
static class InflectionAndFreq implements Comparable<InflectionAndFreq> {
String inflection;
int freq;
InflectionAndFreq(String s, int i) {
this.inflection = s;
this.freq = i;
}
public int compareTo(InflectionAndFreq other) {
int cmp = freq - other.freq;
if (cmp == 0) {
return inflection.compareTo(other.inflection);
} else {
return cmp;
}
}
}
private HashMap<String,Integer> notedInflections = new HashMap<String,Integer>();
}

View File

@ -101,7 +101,6 @@ public class TokenInfoDictionaryBuilder {
}
String[] formatted = formatEntry(entry);
dictionary.noteInflection(formatted);
lines.add(formatted);
// NFKC normalize dictionary entry
@ -115,14 +114,11 @@ public class TokenInfoDictionaryBuilder {
}
formatted = formatEntry(normalizedEntry);
dictionary.noteInflection(formatted);
lines.add(formatted);
}
}
}
dictionary.finalizeInflections();
System.out.println(" sort...");
// sort by term: we sorted the files already and use a stable sort.

View File

@ -33,7 +33,7 @@ import java.util.List;
import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
public class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*,*,*";
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
private String encoding = "euc-jp";
@ -73,9 +73,7 @@ public class UnknownDictionaryBuilder {
// even though the unknown dictionary returns hardcoded null here.
final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
lines.add(parsed);
dictionary.noteInflection(parsed); // for completeness; I think unk.def has no inflections...
}
dictionary.finalizeInflections(); // should also be no-op
Collections.sort(lines, new Comparator<String[]>() {
public int compare(String[] left, String[] right) {