LUCENE-3699: simplify dictionary access and reduce tokeninfodictionary 1.5MB

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232120 13f79535-47bb-0310-9956-ffa450edef68
2012-01-16 19:19:48 +00:00 · 2012-01-16 19:19:48 +00:00 · 12c9b8b4bf
parent 297cc4a275
commit 12c9b8b4bf
12 changed files with 94 additions and 218 deletions
--- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
+++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
@ -37,12 +37,10 @@ public abstract class BinaryDictionary implements Dictionary {
  public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
  public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
  public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
-  public static final String INFLDICT_FILENAME_SUFFIX = "$inflDict.dat";
  
  public static final String DICT_HEADER = "kuromoji_dict";
  public static final String TARGETMAP_HEADER = "kuromoji_dict_map";
  public static final String POSDICT_HEADER = "kuromoji_dict_pos";
-  public static final String INFLDICT_HEADER = "kuromoji_dict_infl";
  public static final int VERSION = 1;
  
  private final ByteBuffer buffer;
@ -52,7 +50,7 @@ public abstract class BinaryDictionary implements Dictionary {
  private final String[] inflFormDict;
  
  protected BinaryDictionary() throws IOException {
-    InputStream mapIS = null, dictIS = null, posIS = null, inflIS = null;
+    InputStream mapIS = null, dictIS = null, posIS = null;
    IOException priorE = null;
    int[] targetMapOffsets = null, targetMap = null;
    String[] posDict = null;
@ -85,25 +83,24 @@ public abstract class BinaryDictionary implements Dictionary {
      posIS = new BufferedInputStream(posIS);
      in = new InputStreamDataInput(posIS);
      CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
-      posDict = new String[in.readVInt()];
-      for (int j = 0; j < posDict.length; j++) {
+      int posSize = in.readVInt();
+      posDict = new String[posSize];
+      inflTypeDict = new String[posSize];
+      inflFormDict = new String[posSize];
+      for (int j = 0; j < posSize; j++) {
        posDict[j] = in.readString();
+        inflTypeDict[j] = in.readString();
+        inflFormDict[j] = in.readString();
+        // this is how we encode null inflections
+        if (inflTypeDict[j].length() == 0) {
+          inflTypeDict[j] = null;
+        }
+        if (inflFormDict[j].length() == 0) {
+          inflFormDict[j] = null;
+        }
      }
      posIS.close(); posIS = null;
      
-      inflIS = getResource(INFLDICT_FILENAME_SUFFIX);
-      inflIS = new BufferedInputStream(inflIS);
-      in = new InputStreamDataInput(inflIS);
-      CodecUtil.checkHeader(in, INFLDICT_HEADER, VERSION, VERSION);
-      int length = in.readVInt();
-      inflTypeDict = new String[length];
-      inflFormDict = new String[length];
-      for (int j = 0; j < length; j++) {
-        inflTypeDict[j] = in.readString();
-        inflFormDict[j] = in.readString();
-      }
-      inflIS.close(); inflIS = null;
-
      dictIS = getResource(DICT_FILENAME_SUFFIX);
      // no buffering here, as we load in one large buffer
      in = new InputStreamDataInput(dictIS);
@ -120,7 +117,7 @@ public abstract class BinaryDictionary implements Dictionary {
    } catch (IOException ioe) {
      priorE = ioe;
    } finally {
-      IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, inflIS, dictIS);
+      IOUtils.closeWhileHandlingException(priorE, mapIS, posIS, dictIS);
    }
    
    this.targetMap = targetMap;
@ -152,27 +149,27 @@ public abstract class BinaryDictionary implements Dictionary {
  
  @Override	
  public int getLeftId(int wordId) {
-    return buffer.getShort(wordId);
+    return buffer.getShort(wordId) >>> 2;
  }
  
  @Override
  public int getRightId(int wordId) {
-    return buffer.getShort(wordId + 2);	// Skip left id
+    return buffer.getShort(wordId) >>> 2;
  }
  
  @Override
  public int getWordCost(int wordId) {
-    return buffer.getShort(wordId + 4);	// Skip left id and right id
+    return buffer.getShort(wordId + 2);	// Skip id
  }

  @Override
  public String getBaseForm(int wordId) {
-    int offset = baseFormOffset(wordId);
-    int length = (buffer.get(offset++) & 0xff) >>> 1;
-    if (length == 0) {
-      return null; // same as surface form
-    } else {
+    if (hasBaseFormData(wordId)) {
+      int offset = baseFormOffset(wordId);
+      int length = buffer.get(offset++) & 0xff;
      return readString(offset, length, false);
+    } else {
+      return null;
    }
  }
  
@ -185,8 +182,7 @@ public abstract class BinaryDictionary implements Dictionary {
  
  @Override
  public String getPartOfSpeech(int wordId) {
-    int posIndex = buffer.get(posOffset(wordId)) & 0xff; // read index into posDict
-    return posDict[posIndex >>> 1];
+    return posDict[getLeftId(wordId)];
  }
  
  @Override
@ -202,28 +198,26 @@ public abstract class BinaryDictionary implements Dictionary {
  
  @Override
  public String getInflectionType(int wordId) {
-    int index = getInflectionIndex(wordId);
-    return index < 0 ? null : inflTypeDict[index];
+    return inflTypeDict[getLeftId(wordId)];
  }

  @Override
  public String getInflectionForm(int wordId) {
-    int index = getInflectionIndex(wordId);
-    return index < 0 ? null : inflFormDict[index];
-  }
-  
-  private static int posOffset(int wordId) {
-    return wordId + 6;
+    return inflFormDict[getLeftId(wordId)];
  }
  
  private static int baseFormOffset(int wordId) {
-    return wordId + 7;
+    return wordId + 4;
  }
  
  private int readingOffset(int wordId) {
    int offset = baseFormOffset(wordId);
-    int baseFormLength = buffer.get(offset++) & 0xfe; // mask away pronunciation bit
-    return offset + baseFormLength;
+    if (hasBaseFormData(wordId)) {
+      int baseFormLength = buffer.get(offset++) & 0xff;
+      return offset + (baseFormLength << 1);
+    } else {
+      return offset;
+    }
  }
  
  private int pronunciationOffset(int wordId) {
@ -238,41 +232,12 @@ public abstract class BinaryDictionary implements Dictionary {
    return offset + readingLength;
  }
  
+  private boolean hasBaseFormData(int wordId) {
+    return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
+  }
+  
  private boolean hasPronunciationData(int wordId) {
-    int baseFormData = buffer.get(baseFormOffset(wordId)) & 0xff;
-    return (baseFormData & 1) == 0;
-  }
-  
-  private boolean hasInflectionData(int wordId) {
-    int posData = buffer.get(posOffset(wordId)) & 0xff;
-    return (posData & 1) == 1;
-  }
-  
-  private int getInflectionIndex(int wordId) {
-    if (!hasInflectionData(wordId)) {
-      return -1; // common case: no inflection data
-    }
-    
-    // skip past reading/pronunciation at the end
-    int offset = hasPronunciationData(wordId) ? pronunciationOffset(wordId) : readingOffset(wordId);
-    int endData = buffer.get(offset++) & 0xff;
-    
-    final int endLength;
-    if ((endData & 1) == 0) {
-      endLength = endData & 0xfe; // UTF-16: mask off kana bit
-    } else {
-      endLength = endData >>> 1;
-    }
-    
-    offset += endLength;
-    
-    byte b = buffer.get(offset++);
-    int i = b & 0x7F;
-    if ((b & 0x80) == 0) return i;
-    b = buffer.get(offset++);
-    i |= (b & 0x7F) << 7;
-    assert ((b & 0x80) == 0);
-    return i;
+    return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
  }
  
  private String readString(int offset, int length, boolean kana) {
@ -288,4 +253,9 @@ public abstract class BinaryDictionary implements Dictionary {
    }
    return new String(text);
  }
+  
+  /** flag that the entry has baseform data. otherwise its not inflected (same as surface form) */
+  public static final int HAS_BASEFORM = 1;
+  /** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
+  public static final int HAS_PRONUNCIATION = 2;
 }
--- a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
+++ b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
--- a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat
+++ b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat
--- a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
+++ b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
--- a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
+++ b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
--- a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
+++ b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
--- a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat
+++ b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat
--- a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
+++ b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
--- a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
+++ b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
--- a/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
+++ b/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
@ -26,11 +26,6 @@ import java.nio.ByteBuffer;
 import java.nio.channels.Channels;
 import java.nio.channels.WritableByteChannel;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-import java.util.HashMap;
-import java.util.Map;

 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.OutputStreamDataOutput;
@ -45,11 +40,7 @@ public abstract class BinaryDictionaryWriter {
  private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
  private int[] targetMap = new int[8192];
  private int[] targetMapOffsets = new int[8192];
-  private final List<String> posDict = new ArrayList<String>();
-  private final Map<String,Integer> posDictLookup = new HashMap<String,Integer>();
-  
-  private final List<String> inflDict = new ArrayList<String>();
-  private final Map<String,Integer> inflDictLookup = new HashMap<String,Integer>();
+  private final ArrayList<String> posDict = new ArrayList<String>();

  public BinaryDictionaryWriter(Class<? extends BinaryDictionary> implClazz, int size) {
    this.implClazz = implClazz;
@ -78,35 +69,20 @@ public abstract class BinaryDictionaryWriter {
        sb.append(part);
      }
    }
-    String pos = sb.toString();
-    Integer posIndex = posDictLookup.get(pos);
-    if (posIndex == null) {
-      posIndex = posDict.size();
-      posDict.add(pos);
-      posDictLookup.put(pos, posIndex);
-      assert posDict.size() == posDictLookup.size();
-    }
+    
+    String posData = sb.toString();
    
    sb.setLength(0);
-    sb.append(CSVUtil.quoteEscape(entry[8]));
+    sb.append(CSVUtil.quoteEscape(posData));
    sb.append(',');
-    sb.append(CSVUtil.quoteEscape(entry[9]));
-    String inflData = sb.toString();
-    
-    Integer inflIndex = Integer.MAX_VALUE;
-    int hasInflData;
-    if ("*,*".equals(inflData)) {
-      hasInflData = 0; // no inflection data
-    } else {
-      hasInflData = 1;
-      inflIndex = inflDictLookup.get(inflData);
-      if (inflIndex == null) {
-        inflIndex = inflDict.size();
-        inflDict.add(inflData);
-        inflDictLookup.put(inflData, inflIndex);
-        assert inflDict.size() == inflDictLookup.size();
-      }
+    if (!"*".equals(entry[8])) {
+      sb.append(CSVUtil.quoteEscape(entry[8]));
    }
+    sb.append(',');
+    if (!"*".equals(entry[9])) {
+      sb.append(CSVUtil.quoteEscape(entry[9]));
+    }
+    String fullPOSData = sb.toString();
    
    String baseForm = entry[10];
    String reading = entry[11];
@ -114,28 +90,40 @@ public abstract class BinaryDictionaryWriter {
    
    // extend buffer if necessary
    int left = buffer.remaining();
-    // worst case: three short, 4 bytes, one vint and features (all as utf-16)
-    int worstCase = 6 + 4 + 2 + 2*(baseForm.length() + reading.length() + pronunciation.length());
+    // worst case: two short, 3 bytes, and features (all as utf-16)
+    int worstCase = 4 + 3 + 2*(baseForm.length() + reading.length() + pronunciation.length());
    if (worstCase > left) {
      ByteBuffer newBuffer = ByteBuffer.allocate(ArrayUtil.oversize(buffer.limit() + worstCase - left, 1));
      buffer.flip();
      newBuffer.put(buffer);
      buffer = newBuffer;
    }
+
+    int flags = 0;
+    if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
+      flags |= BinaryDictionary.HAS_BASEFORM;
+    }
+    if (!pronunciation.equals(reading)) {
+      flags |= BinaryDictionary.HAS_PRONUNCIATION;
+    }
+
+    assert leftId == rightId;
+    assert leftId < 8192; // there are still unused bits
+    // add pos mapping
+    int toFill = 1+leftId - posDict.size();
+    for (int i = 0; i < toFill; i++) {
+      posDict.add(null);
+    }
    
-    buffer.putShort(leftId);
-    buffer.putShort(rightId);
+    String existing = posDict.get(leftId);
+    assert existing == null || existing.equals(fullPOSData);
+    posDict.set(leftId, fullPOSData);
+    
+    buffer.putShort((short)(leftId << 2 | flags));
    buffer.putShort(wordCost);
-    assert posIndex.intValue() < 128;
-    buffer.put((byte) (posIndex.intValue() << 1 | hasInflData));
-    
-    int pronunciationIsReading = pronunciation.equals(reading) ? 1 : 0;
-    
-    if ("*".equals(baseForm) || baseForm.equals(entry[0])) {
-      buffer.put((byte)pronunciationIsReading); // base form is the same as surface form
-    } else {
-      assert baseForm.length() < 128;
-      buffer.put((byte)(baseForm.length() << 1 | pronunciationIsReading));
+
+    if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
+      buffer.put((byte) baseForm.length());
      for (int i = 0; i < baseForm.length(); i++) {
        buffer.putChar(baseForm.charAt(i));
      }
@ -151,7 +139,7 @@ public abstract class BinaryDictionaryWriter {
      }
    }
    
-    if (pronunciationIsReading == 0) {
+    if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
      if (isKatakana(pronunciation)) {
        buffer.put((byte) (pronunciation.length() << 1 | 1));
        writeKatakana(pronunciation);
@ -163,17 +151,6 @@ public abstract class BinaryDictionaryWriter {
      }
    }
    
-    if (hasInflData > 0) {
-      int key = inflIndex.intValue();
-      assert key < 32768; // note there are really like 300 of these...
-      if (key < 128) {
-        buffer.put((byte) key);
-      } else {
-        buffer.put((byte) ((key & 0x7f) | 0x80));
-        buffer.put((byte) (key >>> 7));
-      }
-    }
-    
    return buffer.position();
  }
  
@ -229,7 +206,6 @@ public abstract class BinaryDictionaryWriter {
    writeDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX);
    writeTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX);
    writePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX);
-    writeInflDict(baseName + BinaryDictionary.INFLDICT_FILENAME_SUFFIX);
  }
  
  // TODO: maybe this int[] should instead be the output to the FST...
@ -271,26 +247,17 @@ public abstract class BinaryDictionaryWriter {
      CodecUtil.writeHeader(out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
      out.writeVInt(posDict.size());
      for (String s : posDict) {
-        out.writeString(s);
-      }
-    } finally {
-      os.close();
-    }
-  }
-  
-  protected void writeInflDict(String filename) throws IOException {
-    new File(filename).getParentFile().mkdirs();
-    OutputStream os = new FileOutputStream(filename);
-    try {
-      os = new BufferedOutputStream(os);
-      final DataOutput out = new OutputStreamDataOutput(os);
-      CodecUtil.writeHeader(out, BinaryDictionary.INFLDICT_HEADER, BinaryDictionary.VERSION);
-      out.writeVInt(inflDict.size());
-      for (String s : inflDict) {
-        String data[] = CSVUtil.parse(s);
-        assert data.length == 2 : "malformed inflection: " + s;
-        out.writeString(data[0]);
-        out.writeString(data[1]);
+        if (s == null) {
+          out.writeByte((byte)0);
+          out.writeByte((byte)0);
+          out.writeByte((byte)0);
+        } else {
+          String data[] = CSVUtil.parse(s);
+          assert data.length == 3 : "malformed pos/inflection: " + s;
+          out.writeString(data[0]);
+          out.writeString(data[1]);
+          out.writeString(data[2]);
+        }
      }
    } finally {
      os.close();
@ -313,59 +280,4 @@ public abstract class BinaryDictionaryWriter {
      os.close();
    }
  }
-  
-  // TODO: the below is messy, but makes the dictionary smaller.
-  // we track frequencies of inflections so the highest-freq ones have smaller indexes.
-
-  /** optional: notes inflection seen in the data up front */
-  public void noteInflection(String entry[]) {
-    StringBuilder sb = new StringBuilder();
-    sb.append(CSVUtil.quoteEscape(entry[8]));
-    sb.append(',');
-    sb.append(CSVUtil.quoteEscape(entry[9]));
-    String s = sb.toString();
-    if ("*,*".equals(s)) {
-      return; // no inflection data
-    }
-    Integer freq = notedInflections.get(s);
-    if (freq == null) {
-      freq = 0;
-    }
-    notedInflections.put(s, freq+1);
-  }
-  
-  /** prepopulates inflection mapping by frequency */
-  public void finalizeInflections() {
-    InflectionAndFreq freqs[] = new InflectionAndFreq[notedInflections.size()];
-    int upto = 0;
-    for (Map.Entry<String,Integer> e : notedInflections.entrySet()) {
-      freqs[upto++] = new InflectionAndFreq(e.getKey(), e.getValue());
-    }
-    Arrays.sort(freqs, Collections.reverseOrder());
-    for (int i = 0; i < upto; i++) {
-      inflDict.add(freqs[i].inflection);
-      inflDictLookup.put(freqs[i].inflection, i);
-    }
-  }
-  
-  static class InflectionAndFreq implements Comparable<InflectionAndFreq> {
-    String inflection;
-    int freq;
-    
-    InflectionAndFreq(String s, int i) {
-      this.inflection = s;
-      this.freq = i;
-    }
-    
-    public int compareTo(InflectionAndFreq other) {
-      int cmp = freq - other.freq;
-      if (cmp == 0) {
-        return inflection.compareTo(other.inflection);
-      } else {
-        return cmp;
-      }
-    }
-  }
-  
-  private HashMap<String,Integer> notedInflections = new HashMap<String,Integer>();
 }
--- a/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
+++ b/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
@ -101,7 +101,6 @@ public class TokenInfoDictionaryBuilder {
        }
        
        String[] formatted = formatEntry(entry);
-        dictionary.noteInflection(formatted);
        lines.add(formatted);
        
        // NFKC normalize dictionary entry
@ -115,14 +114,11 @@ public class TokenInfoDictionaryBuilder {
          }
          
          formatted = formatEntry(normalizedEntry);
-          dictionary.noteInflection(formatted);
          lines.add(formatted);
        }
      }
    }
    
-    dictionary.finalizeInflections();
-    
    System.out.println("  sort...");

    // sort by term: we sorted the files already and use a stable sort.
--- a/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
+++ b/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/UnknownDictionaryBuilder.java
@ -33,7 +33,7 @@ import java.util.List;
 import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;

 public class UnknownDictionaryBuilder {
-  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,-,*,*,*,*,*,*,*,*";
+  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
  
  private String encoding = "euc-jp";
  
@ -73,9 +73,7 @@ public class UnknownDictionaryBuilder {
      // even though the unknown dictionary returns hardcoded null here.
      final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
      lines.add(parsed);
-      dictionary.noteInflection(parsed); // for completeness; I think unk.def has no inflections...
    }
-    dictionary.finalizeInflections(); // should also be no-op
    
    Collections.sort(lines, new Comparator<String[]>() {
      public int compare(String[] left, String[] right) {