LUCENE-3129: BlockGroupingCollector wasn't tracking scores correctly; fixed up TestGrouping to reveal the bug

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1131158 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2011-06-03 19:31:51 +00:00
parent 27d8311ffc
commit 01c8469ab3
5 changed files with 187 additions and 52 deletions

TermsHashPerField.java

@@ -181,9 +181,9 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
     // term text into textStart address
     // Get the text & hash of this term.
     int termID;
-    try{
+    try {
       termID = bytesHash.add(termBytesRef, termAtt.fillBytesRef());
-    }catch (MaxBytesLengthExceededException e) {
+    } catch (MaxBytesLengthExceededException e) {
       // Not enough room in current block
       // Just skip this term, to remain as robust as
       // possible during indexing. A TokenFilter
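The try/catch tidied up here is what keeps one pathologically long term from failing a whole document: BytesRefHash throws MaxBytesLengthExceededException when a term cannot fit its byte block (on the order of 32 KB; the exact limit is version-dependent), and the catch clause drops just that term. A minimal sketch of the observable behavior, assuming a test-style writer like the ones used elsewhere in this commit (w, newField, and the size figure are assumptions, not part of the patch):

    // A document containing one oversized term still indexes; only that
    // term is dropped, so the surrounding terms remain searchable.
    char[] big = new char[100000];      // far larger than any byte block
    java.util.Arrays.fill(big, 'x');
    Document doc = new Document();
    doc.add(newField("content", "ok " + new String(big) + " alsoOk", Field.Index.ANALYZED));
    w.addDocument(doc);                 // must not throw
    // "ok" and "alsoOk" are searchable; the giant term was silently skipped.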

BlockGroupingCollector.java

@@ -212,7 +212,7 @@ public class BlockGroupingCollector extends Collector {
         // Swap pending scores
         final float[] savScores = og.scores;
         og.scores = pendingSubScores;
-        pendingSubScores = og.scores;
+        pendingSubScores = savScores;
       }
       og.readerContext = currentReaderContext;
       //og.groupOrd = lastGroupOrd;
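This one-line change is the whole LUCENE-3129 fix, and it is a classic broken swap: once og.scores = pendingSubScores; runs, the group's old array is reachable only through savScores, so reading og.scores back just aliases both fields to the same array. Spelled out:

    // Before the patch: no swap actually happens.
    final float[] savScores = og.scores;
    og.scores = pendingSubScores;
    pendingSubScores = og.scores;    // rereads the value just written: an alias, not a swap

    // After the patch: the group's old array becomes the new scratch buffer.
    final float[] saved = og.scores;
    og.scores = pendingSubScores;
    pendingSubScores = saved;

With the aliasing bug, later writes into pendingSubScores clobbered the scores already saved for the competing group, which is exactly what the strengthened TestGrouping below now catches.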

TermFirstPassGroupingCollector.java

@@ -26,7 +26,7 @@ import java.io.IOException;
 /**
  * Concrete implementation of {@link AbstractFirstPassGroupingCollector} that groups based on
- * field values and more specifically uses {@link org.apache.lucene.search.FieldCache.DocTerms}
+ * field values and more specifically uses {@link org.apache.lucene.search.FieldCache.DocTermsIndex}
  * to collect groups.
  *
  * @lucene.experimental

TermSecondPassGroupingCollector.java

@@ -27,7 +27,7 @@ import java.util.Collection;
 /**
  * Concrete implementation of {@link AbstractSecondPassGroupingCollector} that groups based on
- * field values and more specifically uses {@link org.apache.lucene.search.FieldCache.DocTerms}
+ * field values and more specifically uses {@link org.apache.lucene.search.FieldCache.DocTermsIndex}
  * to collect grouped docs.
  *
  * @lucene.experimental
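Both javadoc corrections point at the same fact: these collectors group on the ord-based FieldCache.DocTermsIndex, not the plain per-document DocTerms, so group comparisons while collecting are cheap int compares. A rough sketch of that API as it stood on trunk at the time (reader and docID are placeholders, and the signatures are from memory, so treat them as approximate):

    // One ordinal per unique group value; compare ints while collecting,
    // resolve an ord back to its term bytes only when building results.
    FieldCache.DocTermsIndex index = FieldCache.DEFAULT.getTermsIndex(reader, "group");
    int ord = index.getOrd(docID);                             // this doc's group, as an ordinal
    BytesRef groupValue = index.lookup(ord, new BytesRef());   // ord -> term bytes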

TestGrouping.java

@@ -154,7 +154,10 @@ public class TestGrouping extends LuceneTestCase {
     final BytesRef group;
     final BytesRef sort1;
     final BytesRef sort2;
+    // content must be "realN ..."
     final String content;
+    float score;
+    float score2;

     public GroupDoc(int id, BytesRef group, BytesRef sort1, BytesRef sort2, String content) {
       this.id = id;
@@ -167,6 +170,9 @@ public class TestGrouping extends LuceneTestCase {

   private Sort getRandomSort() {
     final List<SortField> sortFields = new ArrayList<SortField>();
+    if (random.nextInt(7) == 2) {
+      sortFields.add(SortField.FIELD_SCORE);
+    } else {
       if (random.nextBoolean()) {
         if (random.nextBoolean()) {
           sortFields.add(new SortField("sort1", SortField.STRING, random.nextBoolean()));
@@ -177,6 +183,8 @@ public class TestGrouping extends LuceneTestCase {
         sortFields.add(new SortField("sort1", SortField.STRING, random.nextBoolean()));
         sortFields.add(new SortField("sort2", SortField.STRING, random.nextBoolean()));
       }
+    }
+    // Break ties:
     sortFields.add(new SortField("id", SortField.INT));
     return new Sort(sortFields.toArray(new SortField[sortFields.size()]));
   }
@@ -188,7 +196,15 @@ public class TestGrouping extends LuceneTestCase {
       public int compare(GroupDoc d1, GroupDoc d2) {
         for(SortField sf : sortFields) {
           final int cmp;
-          if (sf.getField().equals("sort1")) {
+          if (sf.getType() == SortField.SCORE) {
+            if (d1.score > d2.score) {
+              cmp = -1;
+            } else if (d1.score < d2.score) {
+              cmp = 1;
+            } else {
+              cmp = 0;
+            }
+          } else if (sf.getField().equals("sort1")) {
             cmp = d1.sort1.compareTo(d2.sort1);
           } else if (sf.getField().equals("sort2")) {
             cmp = d1.sort2.compareTo(d2.sort2);
@@ -213,7 +229,9 @@ public class TestGrouping extends LuceneTestCase {
     for(int fieldIDX=0;fieldIDX<sortFields.length;fieldIDX++) {
       final Comparable<?> c;
       final SortField sf = sortFields[fieldIDX];
-      if (sf.getField().equals("sort1")) {
+      if (sf.getType() == SortField.SCORE) {
+        c = new Float(d.score);
+      } else if (sf.getField().equals("sort1")) {
         c = d.sort1;
       } else if (sf.getField().equals("sort2")) {
         c = d.sort2;
@@ -262,11 +280,11 @@ public class TestGrouping extends LuceneTestCase {
     //System.out.println("TEST: slowGrouping");
     for(GroupDoc d : groupDocs) {
       // TODO: would be better to filter by searchTerm before sorting!
-      if (!d.content.equals(searchTerm)) {
+      if (!d.content.startsWith(searchTerm)) {
         continue;
       }
       totalHitCount++;
-      //System.out.println(" match id=" + d.id);
+      //System.out.println(" match id=" + d.id + " score=" + d.score);

       if (doAllGroups) {
         if (!knownGroups.contains(d.group)) {
@@ -312,9 +330,9 @@ public class TestGrouping extends LuceneTestCase {
       final GroupDoc d = docs.get(docIDX);
       final FieldDoc fd;
       if (fillFields) {
-        fd = new FieldDoc(d.id, 0.0f, fillFields(d, docSort));
+        fd = new FieldDoc(d.id, getScores ? d.score : Float.NaN, fillFields(d, docSort));
       } else {
-        fd = new FieldDoc(d.id, 0.0f);
+        fd = new FieldDoc(d.id, getScores ? d.score : Float.NaN);
       }
       hits[docIDX-docOffset] = fd;
     }
@@ -373,7 +391,7 @@ public class TestGrouping extends LuceneTestCase {
       doc.add(newField("sort1", groupValue.sort1.utf8ToString(), Field.Index.NOT_ANALYZED));
       doc.add(newField("sort2", groupValue.sort2.utf8ToString(), Field.Index.NOT_ANALYZED));
       doc.add(new NumericField("id").setIntValue(groupValue.id));
-      doc.add(newField("content", groupValue.content, Field.Index.NOT_ANALYZED));
+      doc.add(newField("content", groupValue.content, Field.Index.ANALYZED));
       //System.out.println("TEST: doc content=" + groupValue.content + " group=" + (groupValue.group == null ? "null" : groupValue.group.utf8ToString()) + " sort1=" + groupValue.sort1.utf8ToString() + " id=" + groupValue.id);
     }
     // So we can pull filter marking last doc in block:
@@ -421,7 +439,22 @@ public class TestGrouping extends LuceneTestCase {
       groups.add(new BytesRef(_TestUtil.randomRealisticUnicodeString(random)));
       //groups.add(new BytesRef(_TestUtil.randomSimpleString(random)));
     }
-    final String[] contentStrings = new String[] {"a", "b", "c", "d"};
+    final String[] contentStrings = new String[_TestUtil.nextInt(random, 2, 20)];
+    if (VERBOSE) {
+      System.out.println("TEST: create fake content");
+    }
+    for(int contentIDX=0;contentIDX<contentStrings.length;contentIDX++) {
+      final StringBuilder sb = new StringBuilder();
+      sb.append("real" + random.nextInt(3)).append(' ');
+      final int fakeCount = random.nextInt(10);
+      for(int fakeIDX=0;fakeIDX<fakeCount;fakeIDX++) {
+        sb.append("fake ");
+      }
+      contentStrings[contentIDX] = sb.toString();
+      if (VERBOSE) {
+        System.out.println(" content=" + sb.toString());
+      }
+    }

     Directory dir = newDirectory();
     RandomIndexWriter w = new RandomIndexWriter(
@@ -440,7 +473,7 @@ public class TestGrouping extends LuceneTestCase {
     Field sort2 = newField("sort2", "", Field.Index.NOT_ANALYZED);
     doc.add(sort2);
     docNoGroup.add(sort2);
-    Field content = newField("content", "", Field.Index.NOT_ANALYZED);
+    Field content = newField("content", "", Field.Index.ANALYZED);
     doc.add(content);
     docNoGroup.add(content);
     NumericField id = new NumericField("id");
@@ -480,40 +513,96 @@ public class TestGrouping extends LuceneTestCase {
         }
       }

+      final GroupDoc[] groupDocsByID = new GroupDoc[groupDocs.length];
+      System.arraycopy(groupDocs, 0, groupDocsByID, 0, groupDocs.length);
+
       final IndexReader r = w.getReader();
       w.close();

-      // Build 2nd index, where docs are added in blocks by
-      // group, so we can use single pass collector
-      final Directory dir2 = newDirectory();
-      final IndexReader r2 = getDocBlockReader(dir2, groupDocs);
-      final Filter lastDocInBlock = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupend", "x"))));
-
-      final IndexSearcher s = new IndexSearcher(r);
-      final IndexSearcher s2 = new IndexSearcher(r2);
-
+      // NOTE: intentional but temporary field cache insanity!
       final int[] docIDToID = FieldCache.DEFAULT.getInts(r, "id");
-      final int[] docIDToID2 = FieldCache.DEFAULT.getInts(r2, "id");
+      IndexReader r2 = null;
+      Directory dir2 = null;

       try {
+        final IndexSearcher s = new IndexSearcher(r);
+
+        for(int contentID=0;contentID<3;contentID++) {
+          final ScoreDoc[] hits = s.search(new TermQuery(new Term("content", "real"+contentID)), numDocs).scoreDocs;
+          for(ScoreDoc hit : hits) {
+            final GroupDoc gd = groupDocs[docIDToID[hit.doc]];
+            assertTrue(gd.score == 0.0);
+            gd.score = hit.score;
+            assertEquals(gd.id, docIDToID[hit.doc]);
+            //System.out.println(" score=" + hit.score + " id=" + docIDToID[hit.doc]);
+          }
+        }
+
+        for(GroupDoc gd : groupDocs) {
+          assertTrue(gd.score != 0.0);
+        }
+
+        // Build 2nd index, where docs are added in blocks by
+        // group, so we can use single pass collector
+        dir2 = newDirectory();
+        r2 = getDocBlockReader(dir2, groupDocs);
+        final Filter lastDocInBlock = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupend", "x"))));
+        final int[] docIDToID2 = FieldCache.DEFAULT.getInts(r2, "id");
+
+        final IndexSearcher s2 = new IndexSearcher(r2);
+
+        // Reader2 only increases maxDoc() vs reader, which
+        // means a monotonic shift in scores, so we can
+        // reliably remap them w/ Map:
+        final Map<Float,Float> scoreMap = new HashMap<Float,Float>();
+
+        // Tricky: must separately set .score2, because the doc
+        // block index was created with possible deletions!
+        for(int contentID=0;contentID<3;contentID++) {
+          //System.out.println("term=real" + contentID + " dfold=" + s.docFreq(new Term("content", "real"+contentID)) +
+          //" dfnew=" + s2.docFreq(new Term("content", "real"+contentID)));
+          final ScoreDoc[] hits = s2.search(new TermQuery(new Term("content", "real"+contentID)), numDocs).scoreDocs;
+          for(ScoreDoc hit : hits) {
+            final GroupDoc gd = groupDocsByID[docIDToID2[hit.doc]];
+            assertTrue(gd.score2 == 0.0);
+            gd.score2 = hit.score;
+            assertEquals(gd.id, docIDToID2[hit.doc]);
+            //System.out.println(" score=" + hit.score + " id=" + docIDToID2[hit.doc]);
+            scoreMap.put(gd.score, gd.score2);
+          }
+        }
+
         for(int searchIter=0;searchIter<100;searchIter++) {

           if (VERBOSE) {
             System.out.println("TEST: searchIter=" + searchIter);
           }

-          final String searchTerm = contentStrings[random.nextInt(contentStrings.length)];
+          final String searchTerm = "real" + random.nextInt(3);
           final boolean fillFields = random.nextBoolean();
-          final boolean getScores = random.nextBoolean();
+          boolean getScores = random.nextBoolean();
           final boolean getMaxScores = random.nextBoolean();
           final Sort groupSort = getRandomSort();
           //final Sort groupSort = new Sort(new SortField[] {new SortField("sort1", SortField.STRING), new SortField("id", SortField.INT)});
           // TODO: also test null (= sort by relevance)
           final Sort docSort = getRandomSort();

+          for(SortField sf : docSort.getSort()) {
+            if (sf.getType() == SortField.SCORE) {
+              getScores = true;
+            }
+          }
+
+          for(SortField sf : groupSort.getSort()) {
+            if (sf.getType() == SortField.SCORE) {
+              getScores = true;
+            }
+          }
+
           final int topNGroups = _TestUtil.nextInt(random, 1, 30);
           //final int topNGroups = 4;
           final int docsPerGroup = _TestUtil.nextInt(random, 1, 50);
           final int groupOffset = _TestUtil.nextInt(random, 0, (topNGroups-1)/2);
           //final int groupOffset = 0;
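A note on the scoreMap built above: the block index holds the same realN/fake terms with the same docFreqs, but getDocBlockReader adds groupend marker docs, so only maxDoc() grows. Under the DefaultSimilarity of this era, a single-term query's score moves monotonically with idf, so every hit of a given term shifts by one well-defined factor and old score -> new score is a function that a HashMap can capture. A sketch with hypothetical numbers (the idf formula is hedged from memory):

    // idf = 1 + ln(maxDoc / (docFreq + 1)); docFreq is unchanged between the
    // two indexes, only maxDoc differs, so the shift per term is constant.
    int docFreq = 30;
    int maxDocSmall = 100, maxDocBig = 120;   // hypothetical maxDoc() values
    double idfSmall = 1 + Math.log(maxDocSmall / (double) (docFreq + 1));
    double idfBig   = 1 + Math.log(maxDocBig   / (double) (docFreq + 1));
    // idfBig > idfSmall: same ordering, shifted scores, hence remappable.

This is why the fixup pass near the end of the test can rewrite expected scores and score-typed sort values for reader2 with a simple scoreMap.get(...) lookup.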
@@ -523,7 +612,7 @@ public class TestGrouping extends LuceneTestCase {
           final boolean doCache = random.nextBoolean();
           final boolean doAllGroups = random.nextBoolean();
           if (VERBOSE) {
-            System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
+            System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups + " getScores=" + getScores + " getMaxScores=" + getMaxScores);
           }

           final TermAllGroupsCollector allGroupsCollector;
@@ -636,13 +725,12 @@ public class TestGrouping extends LuceneTestCase {
               for(GroupDocs<BytesRef> gd : expectedGroups.groups) {
                 System.out.println(" group=" + (gd.groupValue == null ? "null" : gd.groupValue.utf8ToString()));
                 for(ScoreDoc sd : gd.scoreDocs) {
-                  System.out.println(" id=" + sd.doc);
+                  System.out.println(" id=" + sd.doc + " score=" + sd.score);
                 }
               }
             }
           }

-          // NOTE: intentional but temporary field cache insanity!
-          assertEquals(docIDToID, expectedGroups, groupsResult, true);
+          assertEquals(docIDToID, expectedGroups, groupsResult, true, getScores);

           final boolean needsScores = getScores || getMaxScores || docSort == null;
           final BlockGroupingCollector c3 = new BlockGroupingCollector(groupSort, groupOffset+topNGroups, needsScores, lastDocInBlock);
@@ -665,12 +753,54 @@ public class TestGrouping extends LuceneTestCase {
           } else {
             groupsResult2 = tempTopGroups2;
           }
-          assertEquals(docIDToID2, expectedGroups, groupsResult2, false);
+
+          if (expectedGroups != null) {
+            // Fixup scores for reader2
+            for (GroupDocs groupDocsHits : expectedGroups.groups) {
+              for(ScoreDoc hit : groupDocsHits.scoreDocs) {
+                final GroupDoc gd = groupDocsByID[hit.doc];
+                assertEquals(gd.id, hit.doc);
+                //System.out.println("fixup score " + hit.score + " to " + gd.score2 + " vs " + gd.score);
+                hit.score = gd.score2;
+              }
+            }
+
+            final SortField[] sortFields = groupSort.getSort();
+            for(int groupSortIDX=0;groupSortIDX<sortFields.length;groupSortIDX++) {
+              if (sortFields[groupSortIDX].getType() == SortField.SCORE) {
+                for (GroupDocs groupDocsHits : expectedGroups.groups) {
+                  if (groupDocsHits.groupSortValues != null) {
+                    groupDocsHits.groupSortValues[groupSortIDX] = scoreMap.get(groupDocsHits.groupSortValues[groupSortIDX]);
+                    assertNotNull(groupDocsHits.groupSortValues[groupSortIDX]);
+                  }
+                }
+              }
+            }
+
+            final SortField[] docSortFields = docSort.getSort();
+            for(int docSortIDX=0;docSortIDX<docSortFields.length;docSortIDX++) {
+              if (docSortFields[docSortIDX].getType() == SortField.SCORE) {
+                for (GroupDocs groupDocsHits : expectedGroups.groups) {
+                  for(ScoreDoc _hit : groupDocsHits.scoreDocs) {
+                    FieldDoc hit = (FieldDoc) _hit;
+                    if (hit.fields != null) {
+                      hit.fields[docSortIDX] = scoreMap.get(hit.fields[docSortIDX]);
+                      assertNotNull(hit.fields[docSortIDX]);
+                    }
+                  }
+                }
+              }
+            }
+          }
+
+          assertEquals(docIDToID2, expectedGroups, groupsResult2, false, getScores);
         }
       } finally {
         FieldCache.DEFAULT.purge(r);
+        if (r2 != null) {
           FieldCache.DEFAULT.purge(r2);
+        }
       }

       r.close();
       dir.close();
@@ -680,7 +810,7 @@ public class TestGrouping extends LuceneTestCase {
     }
   }

-  private void assertEquals(int[] docIDtoID, TopGroups expected, TopGroups actual, boolean verifyGroupValues) {
+  private void assertEquals(int[] docIDtoID, TopGroups expected, TopGroups actual, boolean verifyGroupValues, boolean testScores) {
     if (expected == null) {
       assertNull(actual);
       return;
@@ -716,9 +846,14 @@ public class TestGrouping extends LuceneTestCase {
         for(int docIDX=0;docIDX<expectedFDs.length;docIDX++) {
           final FieldDoc expectedFD = (FieldDoc) expectedFDs[docIDX];
           final FieldDoc actualFD = (FieldDoc) actualFDs[docIDX];
+          //System.out.println(" actual doc=" + docIDtoID[actualFD.doc] + " score=" + actualFD.score);
           assertEquals(expectedFD.doc, docIDtoID[actualFD.doc]);
-          // TODO
-          // assertEquals(expectedFD.score, actualFD.score);
+          if (testScores) {
+            assertEquals(expectedFD.score, actualFD.score);
+          } else {
+            // TODO: too anal for now
+            //assertEquals(Float.NaN, actualFD.score);
+          }
           assertArrayEquals(expectedFD.fields, actualFD.fields);
         }
       }