mirror of https://github.com/apache/lucene.git
LUCENE-4290: add some more testing for this sandy highlighter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1426072 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5772204892
commit
c98812b8df
|
@ -0,0 +1,234 @@
|
||||||
|
package org.apache.lucene.sandbox.postingshighlight;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.BreakIterator;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.document.FieldType;
|
||||||
|
import org.apache.lucene.document.StringField;
|
||||||
|
import org.apache.lucene.document.TextField;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
|
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.BooleanClause;
|
||||||
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
import org.apache.lucene.search.TopDocs;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||||
|
|
||||||
|
@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom"})
|
||||||
|
public class TestPostingsHighlighterRanking extends LuceneTestCase {
|
||||||
|
/**
|
||||||
|
* indexes a bunch of gibberish, and then highlights top(n).
|
||||||
|
* asserts that top(n) highlights is a subset of top(n+1) up to some max N
|
||||||
|
*/
|
||||||
|
// TODO: this only tests single-valued fields. we should also index multiple values per field!
|
||||||
|
public void testRanking() throws Exception {
|
||||||
|
// number of documents: we will check each one
|
||||||
|
final int numDocs = atLeast(100);
|
||||||
|
// number of top-N snippets, we will check 1 .. N
|
||||||
|
final int maxTopN = 5;
|
||||||
|
// maximum number of elements to put in a sentence.
|
||||||
|
final int maxSentenceLength = 10;
|
||||||
|
// maximum number of sentences in a document
|
||||||
|
final int maxNumSentences = 20;
|
||||||
|
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
|
||||||
|
Document document = new Document();
|
||||||
|
Field id = new StringField("id", "", Field.Store.NO);
|
||||||
|
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
|
||||||
|
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||||
|
Field body = new Field("body", "", offsetsType);
|
||||||
|
document.add(id);
|
||||||
|
document.add(body);
|
||||||
|
|
||||||
|
for (int i = 0; i < numDocs; i++) {;
|
||||||
|
StringBuilder bodyText = new StringBuilder();
|
||||||
|
int numSentences = _TestUtil.nextInt(random(), 1, maxNumSentences);
|
||||||
|
for (int j = 0; j < numSentences; j++) {
|
||||||
|
bodyText.append(newSentence(random(), maxSentenceLength));
|
||||||
|
}
|
||||||
|
body.setStringValue(bodyText.toString());
|
||||||
|
id.setStringValue(Integer.toString(i));
|
||||||
|
iw.addDocument(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexReader ir = iw.getReader();
|
||||||
|
IndexSearcher searcher = newSearcher(ir);
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
checkDocument(searcher, i, maxTopN);
|
||||||
|
}
|
||||||
|
iw.close();
|
||||||
|
ir.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkDocument(IndexSearcher is, int doc, int maxTopN) throws IOException {
|
||||||
|
for (int ch = 'a'; ch <= 'z'; ch++) {
|
||||||
|
Term term = new Term("body", "" + (char)ch);
|
||||||
|
// check a simple term query
|
||||||
|
checkQuery(is, new TermQuery(term), doc, maxTopN);
|
||||||
|
// check a boolean query
|
||||||
|
BooleanQuery bq = new BooleanQuery();
|
||||||
|
bq.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
|
||||||
|
Term nextTerm = new Term("body", "" + (char)(ch+1));
|
||||||
|
bq.add(new TermQuery(nextTerm), BooleanClause.Occur.SHOULD);
|
||||||
|
checkQuery(is, bq, doc, maxTopN);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkQuery(IndexSearcher is, Query query, int doc, int maxTopN) throws IOException {
|
||||||
|
for (int n = 1; n < maxTopN; n++) {
|
||||||
|
FakePassageFormatter f1 = new FakePassageFormatter();
|
||||||
|
PostingsHighlighter p1 = new PostingsHighlighter("body",
|
||||||
|
Integer.MAX_VALUE-1,
|
||||||
|
BreakIterator.getSentenceInstance(Locale.ROOT),
|
||||||
|
new PassageScorer(),
|
||||||
|
f1);
|
||||||
|
FakePassageFormatter f2 = new FakePassageFormatter();
|
||||||
|
PostingsHighlighter p2 = new PostingsHighlighter("body",
|
||||||
|
Integer.MAX_VALUE-1,
|
||||||
|
BreakIterator.getSentenceInstance(Locale.ROOT),
|
||||||
|
new PassageScorer(),
|
||||||
|
f2);
|
||||||
|
BooleanQuery bq = new BooleanQuery(false);
|
||||||
|
bq.add(query, BooleanClause.Occur.MUST);
|
||||||
|
bq.add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST);
|
||||||
|
TopDocs td = is.search(bq, 1);
|
||||||
|
p1.highlight(bq, is, td, n);
|
||||||
|
p2.highlight(bq, is, td, n+1);
|
||||||
|
assertTrue(f2.seen.containsAll(f1.seen));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* returns a new random sentence, up to maxSentenceLength "words" in length.
|
||||||
|
* each word is a single character (a-z). The first one is capitalized.
|
||||||
|
*/
|
||||||
|
private String newSentence(Random r, int maxSentenceLength) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
int numElements = _TestUtil.nextInt(r, 1, maxSentenceLength);
|
||||||
|
for (int i = 0; i < numElements; i++) {
|
||||||
|
if (sb.length() > 0) {
|
||||||
|
sb.append(' ');
|
||||||
|
sb.append((char)_TestUtil.nextInt(r, 'a', 'z'));
|
||||||
|
} else {
|
||||||
|
// capitalize the first word to help breakiterator
|
||||||
|
sb.append((char)_TestUtil.nextInt(r, 'A', 'Z'));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sb.append(". "); // finalize sentence
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* a fake formatter that doesn't actually format passages.
|
||||||
|
* instead it just collects them for asserts!
|
||||||
|
*/
|
||||||
|
static class FakePassageFormatter extends PassageFormatter {
|
||||||
|
HashSet<Pair> seen = new HashSet<Pair>();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String format(Passage passages[], String content) {
|
||||||
|
for (Passage p : passages) {
|
||||||
|
// verify some basics about the passage
|
||||||
|
assertTrue(p.getScore() >= 0);
|
||||||
|
assertTrue(p.getNumMatches() > 0);
|
||||||
|
assertTrue(p.getStartOffset() >= 0);
|
||||||
|
assertTrue(p.getStartOffset() <= content.length());
|
||||||
|
// we use a very simple analyzer. so we can assert the matches are correct
|
||||||
|
for (int i = 0; i < p.getNumMatches(); i++) {
|
||||||
|
Term term = p.getMatchTerms()[i];
|
||||||
|
assertEquals("body", term.field());
|
||||||
|
int matchStart = p.getMatchStarts()[i];
|
||||||
|
assertTrue(matchStart >= 0);
|
||||||
|
int matchEnd = p.getMatchEnds()[i];
|
||||||
|
assertTrue(matchEnd >= 0);
|
||||||
|
// single character terms
|
||||||
|
assertEquals(matchStart+1, matchEnd);
|
||||||
|
// and the offsets must be correct...
|
||||||
|
BytesRef bytes = term.bytes();
|
||||||
|
assertEquals(1, bytes.length);
|
||||||
|
assertEquals((char)bytes.bytes[bytes.offset], Character.toLowerCase(content.charAt(matchStart)));
|
||||||
|
}
|
||||||
|
// record just the start/end offset for simplicity
|
||||||
|
seen.add(new Pair(p.getStartOffset(), p.getEndOffset()));
|
||||||
|
}
|
||||||
|
return "bogus!!!!!!";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static class Pair {
|
||||||
|
final int start;
|
||||||
|
final int end;
|
||||||
|
|
||||||
|
Pair(int start, int end) {
|
||||||
|
this.start = start;
|
||||||
|
this.end = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
final int prime = 31;
|
||||||
|
int result = 1;
|
||||||
|
result = prime * result + end;
|
||||||
|
result = prime * result + start;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (obj == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (getClass() != obj.getClass()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
Pair other = (Pair) obj;
|
||||||
|
if (end != other.end) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (start != other.start) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "Pair [start=" + start + ", end=" + end + "]";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue