ensure values are sent to the codec in sorted order: beef up AssertingCodec and add more simple tests

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4765@1444653 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2013-02-11 03:43:12 +00:00
parent 2dcf80718c
commit 7bd948623e
3 changed files with 246 additions and 10 deletions


@@ -46,6 +46,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
private int currentDoc;
private int currentValues[] = new int[8];
private int currentUpto = 0;
private int maxCount = 0;
public SortedSetDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
@@ -83,7 +84,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
updateBytesUsed();
}
// finalize currentDoc
// finalize currentDoc: this deduplicates the current term ids
private void finishCurrentDoc() {
Arrays.sort(currentValues, 0, currentUpto);
int lastValue = -1;
@@ -99,6 +100,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
}
// record the number of unique ords for this doc
pendingCounts.add(count);
maxCount = Math.max(maxCount, count);
currentUpto = 0;
currentDoc++;
}
@@ -127,7 +129,9 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
if (currentUpto == currentValues.length) {
currentValues = ArrayUtil.grow(currentValues, currentValues.length+1);
iwBytesUsed.addAndGet((currentValues.length - currentUpto) * RamUsageEstimator.NUM_BYTES_INT);
// reserve additional space for max # values per-doc
// when flushing, we need an int[] to sort the mapped-ords within the doc
iwBytesUsed.addAndGet((currentValues.length - currentUpto) * 2 * RamUsageEstimator.NUM_BYTES_INT);
}
currentValues[currentUpto] = ord;
@@ -143,7 +147,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
@Override
public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
final int maxDoc = state.segmentInfo.getDocCount();
final int maxCountPerDoc = maxCount;
assert pendingCounts.size() == maxDoc;
final int valueCount = hash.size();
@@ -176,7 +180,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
new Iterable<Number>() {
@Override
public Iterator<Number> iterator() {
return new OrdsIterator(ordMap);
return new OrdsIterator(ordMap, maxCountPerDoc);
}
});
}
@@ -221,11 +225,17 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
// iterates over the ords for each doc we have in ram
private class OrdsIterator implements Iterator<Number> {
final AppendingLongBuffer.Iterator iter = pending.iterator();
final AppendingLongBuffer.Iterator counts = pendingCounts.iterator();
final int ordMap[];
final long numOrds;
long ordUpto;
OrdsIterator(int ordMap[]) {
final int currentDoc[];
int currentUpto;
int currentLength;
OrdsIterator(int ordMap[], int maxCount) {
this.currentDoc = new int[maxCount];
this.ordMap = ordMap;
this.numOrds = pending.size();
}
@@ -240,10 +250,20 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
if (!hasNext()) {
throw new NoSuchElementException();
}
int ord = (int) iter.next();
if (currentUpto == currentLength) {
// refill next doc, and sort remapped ords within the doc.
currentUpto = 0;
currentLength = (int) counts.next();
for (int i = 0; i < currentLength; i++) {
currentDoc[i] = ordMap[(int) iter.next()];
}
Arrays.sort(currentDoc, 0, currentLength);
}
int ord = currentDoc[currentUpto];
currentUpto++;
ordUpto++;
// TODO: make reusable Number
return ordMap[ord];
return ord;
}
@Override
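
The writer change above has two halves: at index time, finishCurrentDoc() sorts and deduplicates the write-time ords for the current document; at flush time, the new OrdsIterator refills one document at a time, remaps each write-time ord through ordMap into sorted-term order, and re-sorts within the document, so the codec always receives each document's ords in ascending order. A minimal sketch of that per-document flush step, with hypothetical names (illustrative only, not the Lucene class itself):

// Illustrative sketch only: the per-document remap-and-sort that OrdsIterator
// performs when it refills its buffer. writeTimeOrds are the ords assigned in
// insertion order; ordMap translates them into ords over the sorted terms.
static int[] remapAndSortDocOrds(int[] writeTimeOrds, int[] ordMap) {
  int[] mapped = new int[writeTimeOrds.length];
  for (int i = 0; i < writeTimeOrds.length; i++) {
    mapped[i] = ordMap[writeTimeOrds[i]];
  }
  java.util.Arrays.sort(mapped); // codec contract: ascending ords within a doc
  return mapped;
}

This is also why the iwBytesUsed accounting above now reserves two ints per buffered slot: the flush needs an int[] scratch buffer (sized by maxCount) to hold and sort one document's mapped ords.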


@@ -163,6 +163,39 @@ public class TestDemoDocValue extends LuceneTestCase {
directory.close();
}
public void testTwoValuesUnordered() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
// Store the index in memory:
Directory directory = newDirectory();
// To store an index on disk, use this instead:
// Directory directory = FSDirectory.open(new File("/tmp/testindex"));
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, analyzer);
Document doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("world")));
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
iwriter.addDocument(doc);
iwriter.close();
// Now search the index:
DirectoryReader ireader = DirectoryReader.open(directory); // read-only=true
SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
OrdIterator oi = dv.getOrds(0, null);
assertEquals(0, oi.nextOrd());
assertEquals(1, oi.nextOrd());
assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
BytesRef bytes = new BytesRef();
dv.lookupOrd(0, bytes);
assertEquals(new BytesRef("hello"), bytes);
dv.lookupOrd(1, bytes);
assertEquals(new BytesRef("world"), bytes);
ireader.close();
directory.close();
}
public void testThreeValuesTwoDocs() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
@@ -213,4 +246,146 @@ public class TestDemoDocValue extends LuceneTestCase {
ireader.close();
directory.close();
}
public void testTwoDocumentsLastMissing() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
// Store the index in memory:
Directory directory = newDirectory();
// To store an index on disk, use this instead:
// Directory directory = FSDirectory.open(new File("/tmp/testindex"));
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwconfig.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
Document doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
iwriter.addDocument(doc);
doc = new Document();
iwriter.addDocument(doc);
iwriter.forceMerge(1);
iwriter.close();
// Now search the index:
DirectoryReader ireader = DirectoryReader.open(directory); // read-only=true
SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
OrdIterator oi = dv.getOrds(0, null);
assertEquals(0, oi.nextOrd());
assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
BytesRef bytes = new BytesRef();
dv.lookupOrd(0, bytes);
assertEquals(new BytesRef("hello"), bytes);
assertEquals(1, dv.getValueCount());
ireader.close();
directory.close();
}
public void testTwoDocumentsLastMissingMerge() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
// Store the index in memory:
Directory directory = newDirectory();
// To store an index on disk, use this instead:
// Directory directory = FSDirectory.open(new File("/tmp/testindex"));
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwconfig.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
Document doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
iwriter.addDocument(doc);
iwriter.commit();
doc = new Document();
iwriter.addDocument(doc);
iwriter.forceMerge(1);
iwriter.close();
// Now search the index:
DirectoryReader ireader = DirectoryReader.open(directory); // read-only=true
SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
OrdIterator oi = dv.getOrds(0, null);
assertEquals(0, oi.nextOrd());
assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
BytesRef bytes = new BytesRef();
dv.lookupOrd(0, bytes);
assertEquals(new BytesRef("hello"), bytes);
assertEquals(1, dv.getValueCount());
ireader.close();
directory.close();
}
public void testTwoDocumentsFirstMissing() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
// Store the index in memory:
Directory directory = newDirectory();
// To store an index on disk, use this instead:
// Directory directory = FSDirectory.open(new File("/tmp/testindex"));
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwconfig.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
Document doc = new Document();
iwriter.addDocument(doc);
doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
iwriter.addDocument(doc);
iwriter.forceMerge(1);
iwriter.close();
// Now search the index:
DirectoryReader ireader = DirectoryReader.open(directory); // read-only=true
SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
OrdIterator oi = dv.getOrds(1, null);
assertEquals(0, oi.nextOrd());
assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
BytesRef bytes = new BytesRef();
dv.lookupOrd(0, bytes);
assertEquals(new BytesRef("hello"), bytes);
assertEquals(1, dv.getValueCount());
ireader.close();
directory.close();
}
public void testTwoDocumentsFirstMissingMerge() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
// Store the index in memory:
Directory directory = newDirectory();
// To store an index on disk, use this instead:
// Directory directory = FSDirectory.open(new File("/tmp/testindex"));
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwconfig.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
Document doc = new Document();
iwriter.addDocument(doc);
iwriter.commit();
doc = new Document();
doc.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
iwriter.addDocument(doc);
iwriter.forceMerge(1);
iwriter.close();
// Now search the index:
DirectoryReader ireader = DirectoryReader.open(directory); // read-only=true
SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
OrdIterator oi = dv.getOrds(1, null);
assertEquals(0, oi.nextOrd());
assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
BytesRef bytes = new BytesRef();
dv.lookupOrd(0, bytes);
assertEquals(new BytesRef("hello"), bytes);
assertEquals(1, dv.getValueCount());
ireader.close();
directory.close();
}
}
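
The four missing-document tests above are intentionally near-identical: they differ only in which of the two documents lacks the field and whether an intermediate commit() forces the documents into separate segments before forceMerge(1). A hedged sketch of a hypothetical helper (not part of this commit) that captures the shared shape, assuming it lives inside TestDemoDocValue with the same imports:

// Hypothetical helper, illustrative only: one valued doc, one empty doc,
// optionally split across segments before the forced merge.
private void assertSingleValueWithMissingDoc(boolean missingFirst, boolean twoSegments) throws IOException {
  Analyzer analyzer = new MockAnalyzer(random());
  Directory directory = newDirectory();
  IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
  iwconfig.setMergePolicy(newLogMergePolicy());
  RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
  Document valued = new Document();
  valued.add(new SortedSetDocValuesField("field", new BytesRef("hello")));
  Document empty = new Document();
  iwriter.addDocument(missingFirst ? empty : valued);
  if (twoSegments) {
    iwriter.commit(); // force the two docs into separate segments before merging
  }
  iwriter.addDocument(missingFirst ? valued : empty);
  iwriter.forceMerge(1);
  iwriter.close();
  DirectoryReader ireader = DirectoryReader.open(directory);
  SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field");
  OrdIterator oi = dv.getOrds(missingFirst ? 1 : 0, null); // only the valued doc has an ord
  assertEquals(0, oi.nextOrd());
  assertEquals(OrdIterator.NO_MORE_ORDS, oi.nextOrd());
  BytesRef bytes = new BytesRef();
  dv.lookupOrd(0, bytes);
  assertEquals(new BytesRef("hello"), bytes);
  assertEquals(1, dv.getValueCount());
  ireader.close();
  directory.close();
}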


@@ -130,12 +130,53 @@ public class AssertingDocValuesFormat extends DocValuesFormat {
@Override
public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
// nocommit: add checks
long valueCount = 0;
BytesRef lastValue = null;
for (BytesRef b : values) {
assert b != null;
assert b.isValid();
if (valueCount > 0) {
assert b.compareTo(lastValue) > 0;
}
lastValue = BytesRef.deepCopyOf(b);
valueCount++;
}
int docCount = 0;
long ordCount = 0;
// nocommit
FixedBitSet seenOrds = new FixedBitSet((int)valueCount);
Iterator<Number> ordIterator = ords.iterator();
for (Number v : docToOrdCount) {
assert v != null;
int count = v.intValue();
assert count >= 0;
docCount++;
ordCount += count;
long lastOrd = -1;
for (int i = 0; i < count; i++) {
Number o = ordIterator.next();
assert o != null;
long ord = o.longValue();
assert ord >= 0 && ord < valueCount;
assert ord > lastOrd : "ord=" + ord + ",lastOrd=" + lastOrd;
seenOrds.set((int)ord); // nocommit
lastOrd = ord;
}
}
assert ordIterator.hasNext() == false;
assert docCount == maxDoc;
assert seenOrds.cardinality() == valueCount;
checkIterator(values.iterator(), valueCount);
checkIterator(docToOrdCount.iterator(), maxDoc);
checkIterator(ords.iterator(), ordCount);
in.addSortedSetField(field, values, docToOrdCount, ords);
}
private <T> void checkIterator(Iterator<T> iterator, int expectedSize) {
for (int i = 0; i < expectedSize; i++) {
private <T> void checkIterator(Iterator<T> iterator, long expectedSize) {
for (long i = 0; i < expectedSize; i++) {
boolean hasNext = iterator.hasNext();
assert hasNext;
T v = iterator.next();
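
For reference, the contract that the strengthened AssertingDocValuesFormat checks enforce: docToOrdCount yields one count per document, ords yields exactly that many ords for each document in strictly increasing order, every ord lies in [0, valueCount), and every ord is referenced by at least one document (seenOrds.cardinality() == valueCount). A hedged sketch, using a hypothetical helper rather than anything in this commit, of a consumer pairing the two streams back up per document:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

// Illustrative only: rebuild the per-document ord lists from the two parallel
// streams whose invariants addSortedSetField now asserts.
static List<List<Long>> perDocOrds(Iterable<Number> docToOrdCount, Iterable<Number> ords) {
  List<List<Long>> result = new ArrayList<>();
  Iterator<Number> ordIt = ords.iterator();
  for (Number count : docToOrdCount) {
    List<Long> docOrds = new ArrayList<>();
    for (int i = 0; i < count.intValue(); i++) {
      docOrds.add(ordIt.next().longValue()); // already ascending within the doc
    }
    result.add(docOrds);
  }
  return result;
}

// Example: docToOrdCount = [2, 0, 1], ords = [0, 3, 2]
//   => doc0 -> [0, 3], doc1 -> [], doc2 -> [2]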