add simpletext sorted_set

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4765@1446977 13f79535-47bb-0310-9956-ffa450edef68
2013-02-16 23:09:19 +00:00 · 2013-02-16 23:09:19 +00:00 · 5a1a9a27c0
parent a74122ef64
commit 5a1a9a27c0
4 changed files with 211 additions and 8 deletions
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java
@ -82,6 +82,31 @@ import org.apache.lucene.index.SegmentWriteState;
 *  so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
 *  a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
 *  an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
 *  
 *  for sorted set this is a fixed-width file very similar to the SORTED case, for example:
 *  <pre>
 *  field myField
 *    type SORTED_SET
 *    numvalues 10
 *    maxLength 8
 *    pattern 0
 *    ordpattern XXXXX
 *  length 6
 *  foobar[space][space]
 *  length 3
 *  baz[space][space][space][space][space]
 *  ...
 *  0,3,5   
 *  1,2
 *  
 *  10
 *  ...
 *  </pre>
 *  so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
 *  a document's ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
 *  this is a comma-separated list, and it's padded with spaces to be fixed width, so trim() and split() it.
 *  and beware the empty string!
 *  an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
 *   
 *  the reader can just scan this file when it opens, skipping over the data blocks
 *  and saving the offset/etc for each field. 
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
@ -60,7 +60,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
    int maxLength;
    boolean fixedLength;
    long minValue;
-    int numValues;
+    long numValues;
  };
  final int maxDoc;
@ -110,10 +110,10 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
        field.pattern = stripPrefix(PATTERN);
        field.dataStartFilePointer = data.getFilePointer();
        data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength) * maxDoc);
-      } else if (dvType == DocValuesType.SORTED) {
+      } else if (dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) {
        readLine();
        assert startsWith(NUMVALUES);
-        field.numValues = Integer.parseInt(stripPrefix(NUMVALUES));
+        field.numValues = Long.parseLong(stripPrefix(NUMVALUES));
        readLine();
        assert startsWith(MAXLENGTH);
        field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
@ -280,14 +280,87 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
      @Override
      public int getValueCount() {
-        return field.numValues;
+        return (int)field.numValues;
      }
    };
  }
  @Override
-  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
+  public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
-    throw new UnsupportedOperationException(); // nocommit
+    final OneField field = fields.get(fieldInfo.name);
    // SegmentCoreReaders already verifies this field is
    // valid:
    assert field != null;
    final IndexInput in = data.clone();
    final BytesRef scratch = new BytesRef();
    final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
    return new SortedSetDocValues() {
      String[] currentOrds = new String[0];
      int currentIndex = 0;
      @Override
      public long nextOrd() {
        if (currentIndex == currentOrds.length) {
          return NO_MORE_ORDS;
        } else {
          return Long.parseLong(currentOrds[currentIndex++]);
        }
      }
      @Override
      public void setDocument(int docID) {
        if (docID < 0 || docID >= maxDoc) {
          throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
        }
        try {
          in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + docID * (1 + field.ordPattern.length()));
          SimpleTextUtil.readLine(in, scratch);
          String ordList = scratch.utf8ToString().trim();
          if (ordList.isEmpty()) {
            currentOrds = new String[0];
          } else {
            currentOrds = ordList.split(",");
          }
          currentIndex = 0;
        } catch (IOException ioe) {
          throw new RuntimeException(ioe);
        }
      }
      @Override
      public void lookupOrd(long ord, BytesRef result) {
        try {
          if (ord < 0 || ord >= field.numValues) {
            throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord);
          }
          in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
          SimpleTextUtil.readLine(in, scratch);
          assert StringHelper.startsWith(scratch, LENGTH): "got " + scratch.utf8ToString() + " in=" + in;
          int len;
          try {
            len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
          } catch (ParseException pe) {
            CorruptIndexException e = new CorruptIndexException("failed to parse int length");
            e.initCause(pe);
            throw e;
          }
          result.bytes = new byte[len];
          result.offset = 0;
          result.length = len;
          in.readBytes(result.bytes, 0, len);
        } catch (IOException ioe) {
          throw new RuntimeException(ioe);
        }
      }
      @Override
      public long getValueCount() {
        return field.numValues;
      }
    };
  }
  @Override
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java
@ -22,6 +22,7 @@ import java.math.BigInteger;
 import java.text.DecimalFormat;
 import java.text.DecimalFormatSymbols;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Locale;
 import java.util.Set;
@ -252,7 +253,111 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
  @Override
  public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
-    throw new UnsupportedOperationException(); // nocommit
+    assert fieldSeen(field.name);
    assert field.getDocValuesType() == DocValuesType.SORTED_SET;
    writeFieldEntry(field, FieldInfo.DocValuesType.SORTED_SET);
    long valueCount = 0;
    int maxLength = 0;
    for(BytesRef value : values) {
      maxLength = Math.max(maxLength, value.length);
      valueCount++;
    }
    // write numValues
    SimpleTextUtil.write(data, NUMVALUES);
    SimpleTextUtil.write(data, Long.toString(valueCount), scratch);
    SimpleTextUtil.writeNewline(data);
    // write maxLength
    SimpleTextUtil.write(data, MAXLENGTH);
    SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
    SimpleTextUtil.writeNewline(data);
    int maxBytesLength = Integer.toString(maxLength).length();
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < maxBytesLength; i++) {
      sb.append('0');
    }
    // write our pattern for encoding lengths
    SimpleTextUtil.write(data, PATTERN);
    SimpleTextUtil.write(data, sb.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
    final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
    // compute ord pattern: this is funny, we encode all values for all docs to find the maximum length
    int maxOrdListLength = 0;
    StringBuilder sb2 = new StringBuilder();
    Iterator<Number> ordStream = ords.iterator();
    for (Number n : docToOrdCount) {
      sb2.setLength(0);
      int count = n.intValue();
      for (int i = 0; i < count; i++) {
        long ord = ordStream.next().longValue();
        if (sb2.length() > 0) {
          sb2.append(",");
        }
        sb2.append(Long.toString(ord));
      }
      maxOrdListLength = Math.max(maxOrdListLength, sb2.length());
    }
    sb2.setLength(0);
    for (int i = 0; i < maxOrdListLength; i++) {
      sb2.append('X');
    }
    // write our pattern for ord lists
    SimpleTextUtil.write(data, ORDPATTERN);
    SimpleTextUtil.write(data, sb2.toString(), scratch);
    SimpleTextUtil.writeNewline(data);
    // for asserts:
    long valuesSeen = 0;
    for(BytesRef value : values) {
      // write length
      SimpleTextUtil.write(data, LENGTH);
      SimpleTextUtil.write(data, encoder.format(value.length), scratch);
      SimpleTextUtil.writeNewline(data);
      // write bytes -- don't use SimpleText.write
      // because it escapes:
      data.writeBytes(value.bytes, value.offset, value.length);
      // pad to fit
      for (int i = value.length; i < maxLength; i++) {
        data.writeByte((byte)' ');
      }
      SimpleTextUtil.writeNewline(data);
      valuesSeen++;
      assert valuesSeen <= valueCount;
    }
    assert valuesSeen == valueCount;
    ordStream = ords.iterator();
    // write the ords for each doc comma-separated
    for(Number n : docToOrdCount) {
      sb2.setLength(0);
      int count = n.intValue();
      for (int i = 0; i < count; i++) {
        long ord = ordStream.next().longValue();
        if (sb2.length() > 0) {
          sb2.append(",");
        }
        sb2.append(Long.toString(ord));
      }
      // now pad to fit: these are numbers so spaces work well. reader calls trim()
      int numPadding = maxOrdListLength - sb2.length();
      for (int i = 0; i < numPadding; i++) {
        sb2.append(' ');
      }
      SimpleTextUtil.write(data, sb2.toString(), scratch);
      SimpleTextUtil.writeNewline(data);
    }
  }
  /** write the header for this field */
--- a/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java
+++ b/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java
@ -55,7 +55,7 @@ import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
 */
 // nocommit: should only be Lucene40 and Lucene41
 // nocommit: move to BaseDocValuesTestCase, but allow these to be assume()d (for 4.0 and 4.1)
-@SuppressCodecs({ "Lucene40", "Lucene41", "SimpleText" })
+@SuppressCodecs({ "Lucene40", "Lucene41" })
 public class TestDemoDocValue extends LuceneTestCase {
  public void testSortedSetOneValue() throws IOException {