diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java index d692705fdad..02557c95b57 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java @@ -82,6 +82,31 @@ import org.apache.lucene.index.SegmentWriteState; * so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. * a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid * an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord + * + * for sorted set this is a fixed-width file very similar to the SORTED case, for example: + *
+ *  field myField
+ *    type SORTED_SET
+ *    numvalues 10
+ *    maxLength 8
+ *    pattern 0
+ *    ordpattern XXXXX
+ *  length 6
+ *  foobar[space][space]
+ *  length 3
+ *  baz[space][space][space][space][space]
+ *  ...
+ *  0,3,5
+ *  1,2[space][space]
+ *  [space][space][space][space][space]
+ *  10[space][space][space]
+ *  ...
+ *  
+ * so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. + * a document's ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid + * this is a comma-separated list, and it is padded with spaces to be fixed width. so trim() and split() it. + * and beware the empty string! + * an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord * * the reader can just scan this file when it opens, skipping over the data blocks * and saving the offset/etc for each field. diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java index 5ebcd316599..31d40097f76 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java @@ -60,7 +60,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer { int maxLength; boolean fixedLength; long minValue; - int numValues; + long numValues; }; final int maxDoc; @@ -110,10 +110,10 @@ class SimpleTextDocValuesReader extends DocValuesProducer { field.pattern = stripPrefix(PATTERN); field.dataStartFilePointer = data.getFilePointer(); data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength) * maxDoc); - } else if (dvType == DocValuesType.SORTED) { + } else if (dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) { readLine(); assert startsWith(NUMVALUES); - field.numValues = Integer.parseInt(stripPrefix(NUMVALUES)); + field.numValues = Long.parseLong(stripPrefix(NUMVALUES)); readLine(); assert startsWith(MAXLENGTH); field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH)); @@ -280,14 +280,87 @@ class SimpleTextDocValuesReader extends DocValuesProducer { @Override public int getValueCount() { - return field.numValues; + return 
(int)field.numValues; } }; } @Override - public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { - throw new UnsupportedOperationException(); // nocommit + public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException { + final OneField field = fields.get(fieldInfo.name); + + // SegmentCoreReaders already verifies this field is + // valid: + assert field != null; + + final IndexInput in = data.clone(); + final BytesRef scratch = new BytesRef(); + final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT)); + + return new SortedSetDocValues() { + String[] currentOrds = new String[0]; + int currentIndex = 0; + + @Override + public long nextOrd() { + if (currentIndex == currentOrds.length) { + return NO_MORE_ORDS; + } else { + return Long.parseLong(currentOrds[currentIndex++]); + } + } + + @Override + public void setDocument(int docID) { + if (docID < 0 || docID >= maxDoc) { + throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID); + } + try { + in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + docID * (1 + field.ordPattern.length())); + SimpleTextUtil.readLine(in, scratch); + String ordList = scratch.utf8ToString().trim(); + if (ordList.isEmpty()) { + currentOrds = new String[0]; + } else { + currentOrds = ordList.split(","); + } + currentIndex = 0; + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + + @Override + public void lookupOrd(long ord, BytesRef result) { + try { + if (ord < 0 || ord >= field.numValues) { + throw new IndexOutOfBoundsException("ord must be 0 .. 
" + (field.numValues-1) + "; got " + ord); + } + in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength)); + SimpleTextUtil.readLine(in, scratch); + assert StringHelper.startsWith(scratch, LENGTH): "got " + scratch.utf8ToString() + " in=" + in; + int len; + try { + len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue(); + } catch (ParseException pe) { + CorruptIndexException e = new CorruptIndexException("failed to parse int length"); + e.initCause(pe); + throw e; + } + result.bytes = new byte[len]; + result.offset = 0; + result.length = len; + in.readBytes(result.bytes, 0, len); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + + @Override + public long getValueCount() { + return field.numValues; + } + }; } @Override diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java index e42ad18a115..2f86255cbd6 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java @@ -22,6 +22,7 @@ import java.math.BigInteger; import java.text.DecimalFormat; import java.text.DecimalFormatSymbols; import java.util.HashSet; +import java.util.Iterator; import java.util.Locale; import java.util.Set; @@ -252,7 +253,111 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { @Override public void addSortedSetField(FieldInfo field, Iterable values, Iterable docToOrdCount, Iterable ords) throws IOException { - throw new UnsupportedOperationException(); // nocommit + assert fieldSeen(field.name); + assert field.getDocValuesType() == DocValuesType.SORTED_SET; + writeFieldEntry(field, FieldInfo.DocValuesType.SORTED_SET); + + long valueCount = 0; + int maxLength = 0; + for(BytesRef 
value : values) { + maxLength = Math.max(maxLength, value.length); + valueCount++; + } + + // write numValues + SimpleTextUtil.write(data, NUMVALUES); + SimpleTextUtil.write(data, Long.toString(valueCount), scratch); + SimpleTextUtil.writeNewline(data); + + // write maxLength + SimpleTextUtil.write(data, MAXLENGTH); + SimpleTextUtil.write(data, Integer.toString(maxLength), scratch); + SimpleTextUtil.writeNewline(data); + + int maxBytesLength = Integer.toString(maxLength).length(); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < maxBytesLength; i++) { + sb.append('0'); + } + + // write our pattern for encoding lengths + SimpleTextUtil.write(data, PATTERN); + SimpleTextUtil.write(data, sb.toString(), scratch); + SimpleTextUtil.writeNewline(data); + final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT)); + + // compute ord pattern: this is funny, we encode all values for all docs to find the maximum length + int maxOrdListLength = 0; + StringBuilder sb2 = new StringBuilder(); + Iterator ordStream = ords.iterator(); + for (Number n : docToOrdCount) { + sb2.setLength(0); + int count = n.intValue(); + for (int i = 0; i < count; i++) { + long ord = ordStream.next().longValue(); + if (sb2.length() > 0) { + sb2.append(","); + } + sb2.append(Long.toString(ord)); + } + maxOrdListLength = Math.max(maxOrdListLength, sb2.length()); + } + + sb2.setLength(0); + for (int i = 0; i < maxOrdListLength; i++) { + sb2.append('X'); + } + + // write our pattern for ord lists + SimpleTextUtil.write(data, ORDPATTERN); + SimpleTextUtil.write(data, sb2.toString(), scratch); + SimpleTextUtil.writeNewline(data); + + // for asserts: + long valuesSeen = 0; + + for(BytesRef value : values) { + // write length + SimpleTextUtil.write(data, LENGTH); + SimpleTextUtil.write(data, encoder.format(value.length), scratch); + SimpleTextUtil.writeNewline(data); + + // write bytes -- don't use SimpleText.write + // because it escapes: + 
data.writeBytes(value.bytes, value.offset, value.length); + + // pad to fit + for (int i = value.length; i < maxLength; i++) { + data.writeByte((byte)' '); + } + SimpleTextUtil.writeNewline(data); + valuesSeen++; + assert valuesSeen <= valueCount; + } + + assert valuesSeen == valueCount; + + ordStream = ords.iterator(); + + // write the ords for each doc comma-separated + for(Number n : docToOrdCount) { + sb2.setLength(0); + int count = n.intValue(); + for (int i = 0; i < count; i++) { + long ord = ordStream.next().longValue(); + if (sb2.length() > 0) { + sb2.append(","); + } + sb2.append(Long.toString(ord)); + } + // now pad to fit: these are numbers so spaces work well. reader calls trim() + int numPadding = maxOrdListLength - sb2.length(); + for (int i = 0; i < numPadding; i++) { + sb2.append(' '); + } + SimpleTextUtil.write(data, sb2.toString(), scratch); + SimpleTextUtil.writeNewline(data); + } } /** write the header for this field */ diff --git a/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java b/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java index 44ca4cf6250..d352039f69f 100644 --- a/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java +++ b/lucene/core/src/test/org/apache/lucene/TestDemoDocValue.java @@ -55,7 +55,7 @@ import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS; */ // nocommit: should only be Lucene40 and Lucene41 // nocommit: move to BaseDocValuesTestCase, but allow these to be assume()d (for 4.0 and 4.1) -@SuppressCodecs({ "Lucene40", "Lucene41", "SimpleText" }) +@SuppressCodecs({ "Lucene40", "Lucene41" }) public class TestDemoDocValue extends LuceneTestCase { public void testSortedSetOneValue() throws IOException {