mirror of https://github.com/apache/lucene.git
add simpletext sorted_set
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4765@1446977 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a74122ef64
commit
5a1a9a27c0
|
@ -82,6 +82,31 @@ import org.apache.lucene.index.SegmentWriteState;
|
||||||
* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
|
* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
|
||||||
* a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
|
* a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
|
||||||
* an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
|
* an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
|
||||||
|
*
|
||||||
|
* for sorted set this is a fixed-width file very similar to the SORTED case, for example:
|
||||||
|
* <pre>
|
||||||
|
* field myField
|
||||||
|
* type SORTED_SET
|
||||||
|
* numvalues 10
|
||||||
|
* maxLength 8
|
||||||
|
* pattern 0
|
||||||
|
* ordpattern XXXXX
|
||||||
|
* length 6
|
||||||
|
* foobar[space][space]
|
||||||
|
* length 3
|
||||||
|
* baz[space][space][space][space][space]
|
||||||
|
* ...
|
||||||
|
* 0,3,5
|
||||||
|
* 1,2[space][space]
|
||||||
|
* [space][space][space][space][space]
|
||||||
|
* 10[space][space][space]
|
||||||
|
* ...
|
||||||
|
* </pre>
|
||||||
|
* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
|
||||||
|
* a document's ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
|
||||||
|
* this is a comma-separated list, and it's padded with spaces to be fixed width, so trim() and split() it.
|
||||||
|
* and beware the empty string!
|
||||||
|
* an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
|
||||||
*
|
*
|
||||||
* the reader can just scan this file when it opens, skipping over the data blocks
|
* the reader can just scan this file when it opens, skipping over the data blocks
|
||||||
* and saving the offset/etc for each field.
|
* and saving the offset/etc for each field.
|
||||||
|
|
|
@ -60,7 +60,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
|
||||||
int maxLength;
|
int maxLength;
|
||||||
boolean fixedLength;
|
boolean fixedLength;
|
||||||
long minValue;
|
long minValue;
|
||||||
int numValues;
|
long numValues;
|
||||||
};
|
};
|
||||||
|
|
||||||
final int maxDoc;
|
final int maxDoc;
|
||||||
|
@ -110,10 +110,10 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
|
||||||
field.pattern = stripPrefix(PATTERN);
|
field.pattern = stripPrefix(PATTERN);
|
||||||
field.dataStartFilePointer = data.getFilePointer();
|
field.dataStartFilePointer = data.getFilePointer();
|
||||||
data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength) * maxDoc);
|
data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength) * maxDoc);
|
||||||
} else if (dvType == DocValuesType.SORTED) {
|
} else if (dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) {
|
||||||
readLine();
|
readLine();
|
||||||
assert startsWith(NUMVALUES);
|
assert startsWith(NUMVALUES);
|
||||||
field.numValues = Integer.parseInt(stripPrefix(NUMVALUES));
|
field.numValues = Long.parseLong(stripPrefix(NUMVALUES));
|
||||||
readLine();
|
readLine();
|
||||||
assert startsWith(MAXLENGTH);
|
assert startsWith(MAXLENGTH);
|
||||||
field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
|
field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
|
||||||
|
@ -280,14 +280,87 @@ class SimpleTextDocValuesReader extends DocValuesProducer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getValueCount() {
|
public int getValueCount() {
|
||||||
return field.numValues;
|
return (int)field.numValues;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
|
public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
|
||||||
throw new UnsupportedOperationException(); // nocommit
|
final OneField field = fields.get(fieldInfo.name);
|
||||||
|
|
||||||
|
// SegmentCoreReaders already verifies this field is
|
||||||
|
// valid:
|
||||||
|
assert field != null;
|
||||||
|
|
||||||
|
final IndexInput in = data.clone();
|
||||||
|
final BytesRef scratch = new BytesRef();
|
||||||
|
final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
|
||||||
|
|
||||||
|
return new SortedSetDocValues() {
|
||||||
|
String[] currentOrds = new String[0];
|
||||||
|
int currentIndex = 0;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long nextOrd() {
|
||||||
|
if (currentIndex == currentOrds.length) {
|
||||||
|
return NO_MORE_ORDS;
|
||||||
|
} else {
|
||||||
|
return Long.parseLong(currentOrds[currentIndex++]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setDocument(int docID) {
|
||||||
|
if (docID < 0 || docID >= maxDoc) {
|
||||||
|
throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + docID * (1 + field.ordPattern.length()));
|
||||||
|
SimpleTextUtil.readLine(in, scratch);
|
||||||
|
String ordList = scratch.utf8ToString().trim();
|
||||||
|
if (ordList.isEmpty()) {
|
||||||
|
currentOrds = new String[0];
|
||||||
|
} else {
|
||||||
|
currentOrds = ordList.split(",");
|
||||||
|
}
|
||||||
|
currentIndex = 0;
|
||||||
|
} catch (IOException ioe) {
|
||||||
|
throw new RuntimeException(ioe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void lookupOrd(long ord, BytesRef result) {
|
||||||
|
try {
|
||||||
|
if (ord < 0 || ord >= field.numValues) {
|
||||||
|
throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord);
|
||||||
|
}
|
||||||
|
in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
|
||||||
|
SimpleTextUtil.readLine(in, scratch);
|
||||||
|
assert StringHelper.startsWith(scratch, LENGTH): "got " + scratch.utf8ToString() + " in=" + in;
|
||||||
|
int len;
|
||||||
|
try {
|
||||||
|
len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
|
||||||
|
} catch (ParseException pe) {
|
||||||
|
CorruptIndexException e = new CorruptIndexException("failed to parse int length");
|
||||||
|
e.initCause(pe);
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
result.bytes = new byte[len];
|
||||||
|
result.offset = 0;
|
||||||
|
result.length = len;
|
||||||
|
in.readBytes(result.bytes, 0, len);
|
||||||
|
} catch (IOException ioe) {
|
||||||
|
throw new RuntimeException(ioe);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getValueCount() {
|
||||||
|
return field.numValues;
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.math.BigInteger;
|
||||||
import java.text.DecimalFormat;
|
import java.text.DecimalFormat;
|
||||||
import java.text.DecimalFormatSymbols;
|
import java.text.DecimalFormatSymbols;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
@ -252,7 +253,111 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
|
public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
|
||||||
throw new UnsupportedOperationException(); // nocommit
|
assert fieldSeen(field.name);
|
||||||
|
assert field.getDocValuesType() == DocValuesType.SORTED_SET;
|
||||||
|
writeFieldEntry(field, FieldInfo.DocValuesType.SORTED_SET);
|
||||||
|
|
||||||
|
long valueCount = 0;
|
||||||
|
int maxLength = 0;
|
||||||
|
for(BytesRef value : values) {
|
||||||
|
maxLength = Math.max(maxLength, value.length);
|
||||||
|
valueCount++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// write numValues
|
||||||
|
SimpleTextUtil.write(data, NUMVALUES);
|
||||||
|
SimpleTextUtil.write(data, Long.toString(valueCount), scratch);
|
||||||
|
SimpleTextUtil.writeNewline(data);
|
||||||
|
|
||||||
|
// write maxLength
|
||||||
|
SimpleTextUtil.write(data, MAXLENGTH);
|
||||||
|
SimpleTextUtil.write(data, Integer.toString(maxLength), scratch);
|
||||||
|
SimpleTextUtil.writeNewline(data);
|
||||||
|
|
||||||
|
int maxBytesLength = Integer.toString(maxLength).length();
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (int i = 0; i < maxBytesLength; i++) {
|
||||||
|
sb.append('0');
|
||||||
|
}
|
||||||
|
|
||||||
|
// write our pattern for encoding lengths
|
||||||
|
SimpleTextUtil.write(data, PATTERN);
|
||||||
|
SimpleTextUtil.write(data, sb.toString(), scratch);
|
||||||
|
SimpleTextUtil.writeNewline(data);
|
||||||
|
final DecimalFormat encoder = new DecimalFormat(sb.toString(), new DecimalFormatSymbols(Locale.ROOT));
|
||||||
|
|
||||||
|
// compute ord pattern: this is funny, we encode all values for all docs to find the maximum length
|
||||||
|
int maxOrdListLength = 0;
|
||||||
|
StringBuilder sb2 = new StringBuilder();
|
||||||
|
Iterator<Number> ordStream = ords.iterator();
|
||||||
|
for (Number n : docToOrdCount) {
|
||||||
|
sb2.setLength(0);
|
||||||
|
int count = n.intValue();
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
long ord = ordStream.next().longValue();
|
||||||
|
if (sb2.length() > 0) {
|
||||||
|
sb2.append(",");
|
||||||
|
}
|
||||||
|
sb2.append(Long.toString(ord));
|
||||||
|
}
|
||||||
|
maxOrdListLength = Math.max(maxOrdListLength, sb2.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
sb2.setLength(0);
|
||||||
|
for (int i = 0; i < maxOrdListLength; i++) {
|
||||||
|
sb2.append('X');
|
||||||
|
}
|
||||||
|
|
||||||
|
// write our pattern for ord lists
|
||||||
|
SimpleTextUtil.write(data, ORDPATTERN);
|
||||||
|
SimpleTextUtil.write(data, sb2.toString(), scratch);
|
||||||
|
SimpleTextUtil.writeNewline(data);
|
||||||
|
|
||||||
|
// for asserts:
|
||||||
|
long valuesSeen = 0;
|
||||||
|
|
||||||
|
for(BytesRef value : values) {
|
||||||
|
// write length
|
||||||
|
SimpleTextUtil.write(data, LENGTH);
|
||||||
|
SimpleTextUtil.write(data, encoder.format(value.length), scratch);
|
||||||
|
SimpleTextUtil.writeNewline(data);
|
||||||
|
|
||||||
|
// write bytes -- don't use SimpleTextUtil.write
|
||||||
|
// because it escapes:
|
||||||
|
data.writeBytes(value.bytes, value.offset, value.length);
|
||||||
|
|
||||||
|
// pad to fit
|
||||||
|
for (int i = value.length; i < maxLength; i++) {
|
||||||
|
data.writeByte((byte)' ');
|
||||||
|
}
|
||||||
|
SimpleTextUtil.writeNewline(data);
|
||||||
|
valuesSeen++;
|
||||||
|
assert valuesSeen <= valueCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert valuesSeen == valueCount;
|
||||||
|
|
||||||
|
ordStream = ords.iterator();
|
||||||
|
|
||||||
|
// write the ords for each doc comma-separated
|
||||||
|
for(Number n : docToOrdCount) {
|
||||||
|
sb2.setLength(0);
|
||||||
|
int count = n.intValue();
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
long ord = ordStream.next().longValue();
|
||||||
|
if (sb2.length() > 0) {
|
||||||
|
sb2.append(",");
|
||||||
|
}
|
||||||
|
sb2.append(Long.toString(ord));
|
||||||
|
}
|
||||||
|
// now pad to fit: these are numbers so spaces work well. reader calls trim()
|
||||||
|
int numPadding = maxOrdListLength - sb2.length();
|
||||||
|
for (int i = 0; i < numPadding; i++) {
|
||||||
|
sb2.append(' ');
|
||||||
|
}
|
||||||
|
SimpleTextUtil.write(data, sb2.toString(), scratch);
|
||||||
|
SimpleTextUtil.writeNewline(data);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** write the header for this field */
|
/** write the header for this field */
|
||||||
|
|
|
@ -55,7 +55,7 @@ import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
|
||||||
*/
|
*/
|
||||||
// nocommit: should only be Lucene40 and Lucene41
|
// nocommit: should only be Lucene40 and Lucene41
|
||||||
// nocommit: move to BaseDocValuesTestCase, but allow these to be assume()d (for 4.0 and 4.1)
|
// nocommit: move to BaseDocValuesTestCase, but allow these to be assume()d (for 4.0 and 4.1)
|
||||||
@SuppressCodecs({ "Lucene40", "Lucene41", "SimpleText" })
|
@SuppressCodecs({ "Lucene40", "Lucene41" })
|
||||||
public class TestDemoDocValue extends LuceneTestCase {
|
public class TestDemoDocValue extends LuceneTestCase {
|
||||||
|
|
||||||
public void testSortedSetOneValue() throws IOException {
|
public void testSortedSetOneValue() throws IOException {
|
||||||
|
|
Loading…
Reference in New Issue