LUCENE-7438: Renovate benchmark module's support for highlighting

This commit is contained in:
David Smiley 2016-10-07 09:57:11 -04:00
parent 6aa28bd655
commit 5ef60af9c1
20 changed files with 360 additions and 709 deletions

View File

@ -192,6 +192,8 @@
// excludes: // excludes:
exclude(name: '**/build/**') exclude(name: '**/build/**')
exclude(name: '**/dist/**') exclude(name: '**/dist/**')
exclude(name: 'lucene/benchmark/work/**')
exclude(name: 'lucene/benchmark/temp/**')
exclude(name: '**/CheckLoggingConfiguration.java') exclude(name: '**/CheckLoggingConfiguration.java')
exclude(name: 'build.xml') // ourselves :-) exclude(name: 'build.xml') // ourselves :-)
} }

View File

@ -76,6 +76,9 @@ Other
* LUCENE-7452: Block join query exception suggests how to find a doc, which * LUCENE-7452: Block join query exception suggests how to find a doc, which
violates orthogonality requirement. (Mikhail Khludnev) violates orthogonality requirement. (Mikhail Khludnev)
* LUCENE-7438: Renovate the Benchmark module's support for benchmarking highlighting. All
highlighters are supported via SearchTravRetHighlight. (David Smiley)
Build Build
* LUCENE-7292: Fix build to use "--release 8" instead of "-release 8" on * LUCENE-7292: Fix build to use "--release 8" instead of "-release 8" on

View File

@ -1,2 +1,2 @@
temp/ /temp
work/ /work

View File

@ -13,10 +13,13 @@ writing, there is a page file in
http://download.wikimedia.org/enwiki/20070402/. You can download this http://download.wikimedia.org/enwiki/20070402/. You can download this
file manually and put it in temp. Note that the file you download will file manually and put it in temp. Note that the file you download will
probably have the date in the name, e.g., probably have the date in the name, e.g.,
http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. When http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2.
you put it in temp, rename it to enwiki-latest-pages-articles.xml.bz2.
If you use the EnwikiContentSource, the data will be decompressed on the fly
during the benchmark. If you want to benchmark indexing, you should probably decompress
it beforehand using the "enwiki" Ant target, which produces work/enwiki.txt; you can
then use LineDocSource in your benchmark.
After that, ant enwiki should process the data set and run a load After that, ant enwiki should process the data set and run a load
test. Ant targets get-enwiki, expand-enwiki, and extract-enwiki can test. Ant target enwiki will download, decompress, and extract (to individual files
also be used to download, decompress, and extract (to individual files
in work/enwiki) the dataset, respectively. in work/enwiki) the dataset, respectively.

View File

@ -1,80 +0,0 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
ram.flush.mb=flush:32:32
compound=cmpnd:true:false
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory
doc.stored=true
doc.tokenized=true
doc.term.vector=true
doc.term.vector.offsets=true
doc.term.vector.positions=true
log.step=2000
docs.dir=reuters-out
content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
docs.file=temp/enwiki-20070527-pages-articles.xml
query.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiQueryMaker
enwikiQueryMaker.disableSpanQueries=true
max.field.length=2147483647
highlighter.maxDocCharsToAnalyze=2147483647
# task at this depth or less would print when they start
task.max.depth.log=2
log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate"
CreateIndex
{ "MAddDocs" AddDoc } : 20000
ForceMerge(1)
CloseIndex
}
{
OpenReader
{ "WarmTV" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 100
CloseReader
}
{
"Rounds"
ResetSystemSoft
OpenReader
{ "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 200
CloseReader
ResetSystemSoft
OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 200
CloseReader
RepSumByPref Search
NewRound
} : 4
RepSumByNameRound
RepSumByName

View File

@ -14,55 +14,52 @@
# * See the License for the specific language governing permissions and # * See the License for the specific language governing permissions and
# * limitations under the License. # * limitations under the License.
# */ # */
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
ram.flush.mb=flush:32:32 # For postings-offsets with light term-vectors
compound=cmpnd:true:false
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory directory=FSDirectory
work.dir=work/enwikiPostings
ram.flush.mb=64
compound=false
doc.stored=true doc.stored=true
doc.tokenized=true doc.tokenized=true
# offsets in postings:
doc.body.offsets=true
# term vector, but no positions/offsets with it
doc.term.vector=true doc.term.vector=true
doc.term.vector.offsets=true
doc.term.vector.positions=true
log.step=2000
docs.dir=reuters-out content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
file.query.maker.file=conf/query-phrases.txt
log.queries=false
log.step.SearchTravRetHighlight=-1
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
# task at this depth or less would print when they start
task.max.depth.log=2
log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate" { "Populate"
CreateIndex CreateIndex
{ "MAddDocs" AddDoc } : 20000 [{ "MAddDocs" AddDoc > : 50000] : 4
ForceMerge(1)
CloseIndex CloseIndex
} } : 0
{ "Rounds"
{
"Rounds"
ResetSystemSoft ResetSystemSoft
OpenReader OpenReader
{ "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000
{ "Warm" SearchTravRetHighlight > : 1000
{ "HL" SearchTravRetHighlight > : 500
CloseReader CloseReader
RepSumByPref MAddDocs
NewRound NewRound
} : 6
} : 4 RepSumByPrefRound HL
RepSumByNameRound
RepSumByName
RepSumByPrefRound MAddDocs

View File

@ -14,55 +14,51 @@
# * See the License for the specific language governing permissions and # * See the License for the specific language governing permissions and
# * limitations under the License. # * limitations under the License.
# */ # */
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
ram.flush.mb=flush:32:32 # This is a full-term vector configuration.
compound=cmpnd:true:false
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory directory=FSDirectory
work.dir=work/enwikiTermVec
ram.flush.mb=64
compound=false
doc.stored=true doc.stored=true
doc.tokenized=true doc.tokenized=true
doc.term.vector=true doc.term.vector=true
doc.term.vector.offsets=true
doc.term.vector.positions=true doc.term.vector.positions=true
log.step=2000 doc.term.vector.offsets=true
docs.dir=reuters-out content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
file.query.maker.file=conf/query-terms.txt
log.queries=false
log.step.SearchTravRetHighlight=-1
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V
# task at this depth or less would print when they start
task.max.depth.log=2
log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate" { "Populate"
CreateIndex CreateIndex
{ "MAddDocs" AddDoc } : 20000 [{ "MAddDocs" AddDoc > : 50000] : 4
ForceMerge(1)
CloseIndex CloseIndex
} } : 0
{ "Rounds"
{
"Rounds"
ResetSystemSoft ResetSystemSoft
OpenReader OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000
{ "Warm" SearchTravRetHighlight > : 1000
{ "HL" SearchTravRetHighlight > : 500
CloseReader CloseReader
RepSumByPref MAddDocs
NewRound NewRound
} : 4 } : 4
RepSumByNameRound RepSumByPrefRound HL
RepSumByName
RepSumByPrefRound MAddDocs

View File

@ -54,7 +54,7 @@ log.queries=true
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000 { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
CloseReader CloseReader
OpenReader OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000 { "SearchHlgtSameRdr" SearchTravRetHighlight(type[UH]) > : 1000
CloseReader CloseReader

View File

@ -0,0 +1,10 @@
"Abraham Lincoln"
"Union Wisconsin"
"court of law"
"Field Theory" OR "Set Theory"
"Top 100"
"red hot chili"
"greatest guitarists"
"Planes, Trains & Automobiles" OR ships
"international airport"
"Xbox 360"

View File

@ -0,0 +1,10 @@
Abraham AND Lincoln
Union AND Wisconsin
court AND law
top AND 100
(field OR set) AND theory
red AND hot AND chili
greatest AND guitarists
(planes AND trains AND automobiles) OR ships
international AND airport
xbox AND 360

View File

@ -0,0 +1,7 @@
abrah* AND linc*
court* AND law*
(field OR set) AND theor*
red AND hot AND chili*
great* AND guitar*
(plan* AND train* AND automob*) OR ship*
international AND airport*

View File

@ -1,69 +0,0 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
ram.flush.mb=flush:32:32
compound=cmpnd:true:false
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory
doc.stored=true
doc.tokenized=true
doc.term.vector=true
doc.term.vector.offsets=true
doc.term.vector.positions=true
log.step=2000
docs.dir=reuters-out
content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
# task at this depth or less would print when they start
task.max.depth.log=2
log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate"
CreateIndex
{ "MAddDocs" AddDoc } : 20000
ForceMerge(1)
CloseIndex
}
{ "Rounds"
ResetSystemSoft
OpenReader
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
CloseReader
OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
CloseReader
RepSumByPref SearchHlgtSameRdr
NewRound
} : 2
RepSumByNameRound
RepSumByName
RepSumByPrefRound MAddDocs

View File

@ -349,6 +349,8 @@ public class PerfRunData implements Closeable {
// Hold reference to new IR // Hold reference to new IR
indexReader.incRef(); indexReader.incRef();
indexSearcher = new IndexSearcher(indexReader); indexSearcher = new IndexSearcher(indexReader);
// TODO Some day we should make the query cache in this module configurable and control clearing the cache
indexSearcher.setQueryCache(null);
} else { } else {
indexSearcher = null; indexSearcher = null;
} }

View File

@ -43,6 +43,7 @@ import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField; import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField; import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;
/** /**
* Creates {@link Document} objects. Uses a {@link ContentSource} to generate * Creates {@link Document} objects. Uses a {@link ContentSource} to generate
@ -58,6 +59,8 @@ import org.apache.lucene.document.TextField;
* (default <b>true</b>). * (default <b>true</b>).
* <li><b>doc.body.tokenized</b> - specifies whether the * <li><b>doc.body.tokenized</b> - specifies whether the
* body field should be tokenized (default = <b>doc.tokenized</b>). * body field should be tokenized (default = <b>doc.tokenized</b>).
* <li><b>doc.body.offsets</b> - specifies whether to add offsets into the postings index
* for the body field. It is useful for highlighting. (default <b>false</b>)
* <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in * <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
* the index or not. (default <b>false</b>). * the index or not. (default <b>false</b>).
* <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be * <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be
@ -424,6 +427,7 @@ public class DocMaker implements Closeable {
boolean bodyTokenized = config.get("doc.body.tokenized", tokenized); boolean bodyTokenized = config.get("doc.body.tokenized", tokenized);
boolean norms = config.get("doc.tokenized.norms", false); boolean norms = config.get("doc.tokenized.norms", false);
boolean bodyNorms = config.get("doc.body.tokenized.norms", true); boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
boolean bodyOffsets = config.get("doc.body.offsets", false);
boolean termVec = config.get("doc.term.vector", false); boolean termVec = config.get("doc.term.vector", false);
boolean termVecPositions = config.get("doc.term.vector.positions", false); boolean termVecPositions = config.get("doc.term.vector.positions", false);
boolean termVecOffsets = config.get("doc.term.vector.offsets", false); boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
@ -441,6 +445,9 @@ public class DocMaker implements Closeable {
bodyValType.setStored(bodyStored); bodyValType.setStored(bodyStored);
bodyValType.setTokenized(bodyTokenized); bodyValType.setTokenized(bodyTokenized);
bodyValType.setOmitNorms(!bodyNorms); bodyValType.setOmitNorms(!bodyNorms);
if (bodyTokenized && bodyOffsets) {
bodyValType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
}
bodyValType.setStoreTermVectors(termVec); bodyValType.setStoreTermVectors(termVec);
bodyValType.setStoreTermVectorPositions(termVecPositions); bodyValType.setStoreTermVectorPositions(termVecPositions);
bodyValType.setStoreTermVectorOffsets(termVecOffsets); bodyValType.setStoreTermVectorOffsets(termVecOffsets);

View File

@ -1,30 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
/**
 * Base type for the highlighting strategies exercised by the benchmark read tasks.
 * A task obtains an instance and invokes {@link #doHighlight} once per highlighted
 * field of each hit, adding the returned value to its record count.
 */
public abstract class BenchmarkHighlighter {

  /**
   * Highlights a single field of a single hit.
   *
   * @param reader   reader the hit was found in
   * @param doc      id of the hit document
   * @param field    name of the field being highlighted
   * @param document the retrieved document for {@code doc}
   * @param analyzer analyzer made available to the highlighter
   * @param text     the value of {@code field} taken from {@code document}
   * @return a count that the caller adds to its result total (presumably the number
   *         of fragments produced -- confirm against the concrete implementation)
   * @throws Exception if highlighting fails
   */
  public abstract int doHighlight(
      IndexReader reader,
      int doc,
      String field,
      Document document,
      Analyzer analyzer,
      String text) throws Exception;
}

View File

@ -75,7 +75,7 @@ public abstract class ReadTask extends PerfTask {
int res = 0; int res = 0;
// open reader or use existing one // open reader or use existing one
IndexSearcher searcher = getRunData().getIndexSearcher(); IndexSearcher searcher = getRunData().getIndexSearcher(); // (will incRef the reader)
IndexReader reader; IndexReader reader;
@ -132,8 +132,9 @@ public abstract class ReadTask extends PerfTask {
//hits = collector.topDocs(); //hits = collector.topDocs();
} }
if (hits != null) {
final String printHitsField = getRunData().getConfig().get("print.hits.field", null); final String printHitsField = getRunData().getConfig().get("print.hits.field", null);
if (hits != null && printHitsField != null && printHitsField.length() > 0) { if (printHitsField != null && printHitsField.length() > 0) {
System.out.println("totalHits = " + hits.totalHits); System.out.println("totalHits = " + hits.totalHits);
System.out.println("maxDoc() = " + reader.maxDoc()); System.out.println("maxDoc() = " + reader.maxDoc());
System.out.println("numDocs() = " + reader.numDocs()); System.out.println("numDocs() = " + reader.numDocs());
@ -144,34 +145,7 @@ public abstract class ReadTask extends PerfTask {
} }
} }
if (withTraverse()) { res += withTopDocs(searcher, q, hits);
final ScoreDoc[] scoreDocs = hits.scoreDocs;
int traversalSize = Math.min(scoreDocs.length, traversalSize());
if (traversalSize > 0) {
boolean retrieve = withRetrieve();
int numHighlight = Math.min(numToHighlight(), scoreDocs.length);
Analyzer analyzer = getRunData().getAnalyzer();
BenchmarkHighlighter highlighter = null;
if (numHighlight > 0) {
highlighter = getBenchmarkHighlighter(q);
}
for (int m = 0; m < traversalSize; m++) {
int id = scoreDocs[m].doc;
res++;
if (retrieve) {
Document document = retrieveDoc(reader, id);
res += document != null ? 1 : 0;
if (numHighlight > 0 && m < numHighlight) {
Collection<String> fieldsToHighlight = getFieldsToHighlight(document);
for (final String field : fieldsToHighlight) {
String text = document.get(field);
res += highlighter.doHighlight(reader, id, field, document, analyzer, text);
}
}
}
}
}
} }
} }
} }
@ -185,6 +159,28 @@ public abstract class ReadTask extends PerfTask {
return res; return res;
} }
/**
 * Traverses (and optionally retrieves) the top hits of a search.
 * Counts one record per traversed hit plus one more for every hit whose document
 * was retrieved and is non-null.
 *
 * @param searcher searcher whose reader is used to retrieve documents
 * @param q        the query that produced {@code hits} (unused here)
 * @param hits     the search results to walk; must not be null
 * @return the number of records counted; 0 when traversal is disabled or there is nothing to traverse
 * @throws Exception if retrieving a document fails
 */
protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
  final IndexReader indexReader = searcher.getIndexReader();
  if (!withTraverse()) {
    return 0;
  }

  final ScoreDoc[] topHits = hits.scoreDocs;
  final int limit = Math.min(topHits.length, traversalSize());
  if (limit <= 0) {
    return 0;
  }

  // Only consulted once there is at least one hit to walk.
  final boolean fetchDocs = withRetrieve();
  int count = 0;
  for (int i = 0; i < limit; i++) {
    count++; // the traversed hit itself
    if (fetchDocs && retrieveDoc(indexReader, topHits[i].doc) != null) {
      count++; // the successfully retrieved document
    }
  }
  return count;
}
protected Collector createCollector() throws Exception { protected Collector createCollector() throws Exception {
return TopScoreDocCollector.create(numHits()); return TopScoreDocCollector.create(numHits());
} }
@ -267,39 +263,8 @@ public abstract class ReadTask extends PerfTask {
*/ */
public abstract boolean withRetrieve(); public abstract boolean withRetrieve();
/**
* Set to the number of documents to highlight.
* The base implementation returns 0.
*
* @return The number of the results to highlight. 0 means no docs will be highlighted.
*/
public int numToHighlight() {
return 0;
}
/**
* Return an appropriate highlighter to be used with
* highlighting tasks. The base implementation returns null.
*
* @param q the query handed over by the calling task
* @return the highlighter to use; null in this base implementation
*/
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
return null;
}
protected Sort getSort() { protected Sort getSort() {
return null; return null;
} }
/**
 * Chooses which fields of a document should be highlighted.
 * This base implementation selects every field present in the document.
 *
 * @param document The Document
 * @return the distinct field names (Strings) found in {@code document}
 */
protected Collection<String> getFieldsToHighlight(Document document) {
  final List<IndexableField> allFields = document.getFields();
  // Sized by the field count; duplicate names collapse in the set.
  final Set<String> names = new HashSet<>(allFields.size());
  allFields.forEach(field -> names.add(field.name()));
  return names;
}
} }

View File

@ -14,65 +14,98 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.benchmark.byTask.tasks; package org.apache.lucene.benchmark.byTask.tasks;
import java.text.BreakIterator;
import java.util.Collection; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.HashSet; import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources; import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.WeightedFragListBuilder;
import org.apache.lucene.util.ArrayUtil;
/** /**
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents. * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
* *
* Uses the {@link org.apache.lucene.search.highlight.SimpleHTMLFormatter} for formatting.
*
* <p>Note: This task reuses the reader if it is already open. * <p>Note: This task reuses the reader if it is already open.
* Otherwise a reader is opened at start and closed at the end. * Otherwise a reader is opened at start and closed at the end.
* </p> * </p>
* *
* <p>Takes optional multivalued, comma separated param string as: size[&lt;traversal size&gt;],highlight[&lt;int&gt;],maxFrags[&lt;int&gt;],mergeContiguous[&lt;boolean&gt;],fields[name1;name2;...]</p> * <p>Takes optional multivalued, comma separated param string as: type[&lt;enum&gt;],maxFrags[&lt;int&gt;],fields[name1;name2;...]</p>
* <ul> * <ul>
* <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li> * <li>type - the highlighter implementation, e.g. "UH"</li>
* <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
* <li>maxFrags - The maximum number of fragments to score by the highlighter</li> * <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
* <li>mergeContiguous - true if contiguous fragments should be merged.</li>
* <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li> * <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
* </ul> * </ul>
* Example: * Example:
* <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) &gt; : 1000 * <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(type[UH],maxFrags[3],fields[body]) &gt; : 1000
* </pre> * </pre>
* *
* Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well. * Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well,
* and offsets in postings is another option.
* *
* <p>Other side effects: counts additional 1 (record) for each traversed hit, * <p>Other side effects: counts additional 1 (record) for each traversed hit,
* and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p> * and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
*/ */
public class SearchTravRetHighlightTask extends SearchTravTask { public class SearchTravRetHighlightTask extends SearchTravTask {
private int maxDocCharsToAnalyze; // max leading content chars to highlight
protected int numToHighlight = Integer.MAX_VALUE; private int maxFrags = 1; // aka passages
protected boolean mergeContiguous; private Set<String> hlFields = Collections.singleton("body");
protected int maxFrags = 2; private String type;
protected Set<String> paramFields = Collections.emptySet(); private HLImpl hlImpl;
protected Highlighter highlighter; private Analyzer analyzer;
protected int maxDocCharsToAnalyze;
public SearchTravRetHighlightTask(PerfRunData runData) { public SearchTravRetHighlightTask(PerfRunData runData) {
super(runData); super(runData);
} }
/**
 * Parses this task's comma separated parameter string, e.g.
 * {@code type[UH],maxFrags[3],fields[body]}.
 * <ul>
 *   <li>{@code type[...]} - the highlighter implementation name</li>
 *   <li>{@code maxFrags[...]} - maximum number of fragments (parsed as a float, truncated to int)</li>
 *   <li>{@code fields[a;b;...]} - semicolon separated names of the fields to highlight</li>
 * </ul>
 * Whitespace around each parameter is ignored. Parameters with an unrecognized name are skipped.
 *
 * @param params the raw parameter string; must not be null
 * @throws IllegalArgumentException if a recognized parameter lacks its closing {@code ]}
 * @throws NumberFormatException if the {@code maxFrags} value is not numeric
 */
@Override
public void setParams(String params) {
  // can't call super because super doesn't understand our params syntax
  this.params = params;
  // TODO consider instead using data.getConfig().get("highlighter.*")?
  for (String rawParam : params.split(",")) {
    // tolerate "a[1], b[2]" style spacing between parameters
    final String param = rawParam.trim();
    if (param.startsWith("type[")) {
      type = bracketedValue(param, "type[");
    } else if (param.startsWith("maxFrags[")) {
      maxFrags = (int) Float.parseFloat(bracketedValue(param, "maxFrags["));
    } else if (param.startsWith("fields[")) {
      final String[] fieldNames = bracketedValue(param, "fields[").split(";");
      hlFields = new HashSet<>(Arrays.asList(fieldNames));
    }
  }
}

/** Returns the text between {@code prefix} and the closing {@code ]} of {@code param}. */
private static String bracketedValue(String param, String prefix) {
  // Without this check a missing ']' would silently drop the value's last character.
  if (!param.endsWith("]")) {
    throw new IllegalArgumentException(
        "malformed parameter (expected " + prefix + "...]): " + param);
  }
  return param.substring(prefix.length(), param.length() - 1);
}
@Override @Override
public void setup() throws Exception { public void setup() throws Exception {
super.setup(); super.setup();
@ -82,72 +115,188 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
throw new Exception("doc.stored must be set to true"); throw new Exception("doc.stored must be set to true");
} }
maxDocCharsToAnalyze = data.getConfig().get("highlighter.maxDocCharsToAnalyze", Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); maxDocCharsToAnalyze = data.getConfig().get("highlighter.maxDocCharsToAnalyze", Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
analyzer = data.getAnalyzer();
String type = this.type;
if (type == null) {
type = data.getConfig().get("highlighter", null);
}
switch (type) {
case "NONE": hlImpl = new NoHLImpl(); break;
case "SH_A": hlImpl = new StandardHLImpl(false); break;
case "SH_V": hlImpl = new StandardHLImpl(true); break;
case "FVH_V": hlImpl = new FastVectorHLImpl(); break;
case "UH": hlImpl = new UnifiedHLImpl(null); break;
case "UH_A": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.ANALYSIS); break;
case "UH_V": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.TERM_VECTORS); break;
case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break;
case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break;
case "PH_P": hlImpl = new PostingsHLImpl(); break;
default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
}
} }
// here is where we intercept ReadTask's logic to do the highlighting, and nothing else (no retrieval of all field vals)
@Override @Override
public boolean withRetrieve() { protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
return true; hlImpl.withTopDocs(searcher, q, hits);
// note: it'd be nice if we knew the sum kilobytes of text across these hits so we could return that. It'd be a more
// useful number to gauge the amount of work. But given "average" document sizes and lots of queries, returning the
// number of docs is reasonable.
return hits.scoreDocs.length; // always return # scored docs.
} }
@Override private interface HLImpl {
public int numToHighlight() { void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception;
return numToHighlight;
} }
@Override private volatile int preventOptimizeAway = 0;
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q)); private class StandardHLImpl implements HLImpl {
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>");
DefaultEncoder encoder = new DefaultEncoder();
Highlighter highlighter = new Highlighter(formatter, encoder, null);
boolean termVecs;
StandardHLImpl(boolean termVecs) {
highlighter.setEncoder(new DefaultEncoder());
highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze); highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
return new BenchmarkHighlighter(){ this.termVecs = termVecs;
}
@Override @Override
public int doHighlight(IndexReader reader, int doc, String field, public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
Document document, Analyzer analyzer, String text) throws Exception { IndexReader reader = searcher.getIndexReader();
final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1; highlighter.setFragmentScorer(new QueryScorer(q));
TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset); // highlighter.setTextFragmenter(); unfortunately no sentence mechanism, not even regex. Default here is trivial
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags); for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
return frag != null ? frag.length : 0; Document document = reader.document(scoreDoc.doc, hlFields);
Fields tvFields = termVecs ? reader.getTermVectors(scoreDoc.doc) : null;
for (IndexableField indexableField : document) {
TokenStream tokenStream;
if (termVecs) {
tokenStream = TokenSources.getTokenStream(indexableField.name(), tvFields,
indexableField.stringValue(), analyzer, maxDocCharsToAnalyze);
} else {
tokenStream = analyzer.tokenStream(indexableField.name(), indexableField.stringValue());
}
// will close TokenStream:
String[] fragments = highlighter.getBestFragments(tokenStream, indexableField.stringValue(), maxFrags);
preventOptimizeAway = fragments.length;
}
}
}
}
/**
 * {@link HLImpl} backed by {@link FastVectorHighlighter}.
 * Fragments are bounded by English sentence breaks, ordered by score, and wrapped in
 * {@code <em>} tags. Phrase highlighting is enabled; field matching is not required.
 */
private class FastVectorHLImpl implements HLImpl {
  int fragSize = 100;
  WeightedFragListBuilder fragListBuilder = new WeightedFragListBuilder();
  BoundaryScanner bs = new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ENGLISH));
  ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(bs);
  String[] preTags = {"<em>"};
  String[] postTags = {"</em>"};
  Encoder encoder = new DefaultEncoder();// new SimpleHTMLEncoder();
  FastVectorHighlighter highlighter = new FastVectorHighlighter(
      true,   // phraseHighlight
      false); // requireFieldMatch -- not pertinent to our benchmark

  /**
   * Highlights every configured highlight field of every hit, visiting hits in doc id order.
   *
   * @param searcher supplies the index reader to highlight against
   * @param q        the query whose terms are highlighted
   * @param hits     the hits to highlight
   */
  @Override
  public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
    IndexReader reader = searcher.getIndexReader();
    // The field query depends only on the query and reader, so build it once per search.
    final FieldQuery fq = highlighter.getFieldQuery(q, reader);
    for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
      for (String hlField : hlFields) {
        String[] fragments = highlighter.getBestFragments(fq, reader, scoreDoc.doc, hlField, fragSize, maxFrags,
            fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
        // The result may be null (e.g. the hit has no value for this field); a missing field
        // must not abort the whole benchmark run with a NullPointerException.
        preventOptimizeAway = fragments == null ? 0 : fragments.length;
      }
    }
  }
}
/**
 * Returns a copy of the given hits sorted by ascending doc id.
 * The array passed in is not modified.
 */
private ScoreDoc[] docIdOrder(ScoreDoc[] scoreDocs) {
  final ScoreDoc[] byDocId = scoreDocs.clone();
  ArrayUtil.introSort(byDocId, (left, right) -> Integer.compare(left.doc, right.doc));
  return byDocId;
}
private class PostingsHLImpl implements HLImpl {
PostingsHighlighter highlighter;
String[] fields = hlFields.toArray(new String[hlFields.size()]);
int[] maxPassages;
PostingsHLImpl() {
highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) {
@Override
protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards
return analyzer;
}
@Override
protected BreakIterator getBreakIterator(String field) {
return BreakIterator.getSentenceInstance(Locale.ENGLISH);
} }
}; };
maxPassages = new int[hlFields.size()];
Arrays.fill(maxPassages, maxFrags);
} }
@Override @Override
protected Collection<String> getFieldsToHighlight(Document document) { public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
Collection<String> result = super.getFieldsToHighlight(document); Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages);
//if stored is false, then result will be empty, in which case just get all the param fields preventOptimizeAway = result.size();
if (paramFields.isEmpty() == false && result.isEmpty() == false) {
result.retainAll(paramFields);
} else {
result = paramFields;
} }
return result; }
private class UnifiedHLImpl implements HLImpl {
UnifiedHighlighter highlighter;
IndexSearcher lastSearcher;
UnifiedHighlighter.OffsetSource offsetSource; // null means auto select
String[] fields = hlFields.toArray(new String[hlFields.size()]);
int[] maxPassages;
UnifiedHLImpl(final UnifiedHighlighter.OffsetSource offsetSource) {
this.offsetSource = offsetSource;
maxPassages = new int[hlFields.size()];
Arrays.fill(maxPassages, maxFrags);
}
private void reset(IndexSearcher searcher) {
if (lastSearcher == searcher) {
return;
}
lastSearcher = searcher;
highlighter = new UnifiedHighlighter(searcher, analyzer) {
@Override
protected OffsetSource getOffsetSource(String field) {
return offsetSource != null ? offsetSource : super.getOffsetSource(field);
}
};
highlighter.setBreakIterator(() -> BreakIterator.getSentenceInstance(Locale.ENGLISH));
highlighter.setMaxLength(maxDocCharsToAnalyze);
highlighter.setHighlightPhrasesStrictly(true);
highlighter.setHandleMultiTermQuery(true);
} }
@Override @Override
public void setParams(String params) { public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
// can't call super because super doesn't understand our reset(searcher);
// params syntax Map<String, String[]> result = highlighter.highlightFields(fields, q, hits, maxPassages);
this.params = params; preventOptimizeAway = result.size();
String [] splits = params.split(",");
for (int i = 0; i < splits.length; i++) {
if (splits[i].startsWith("size[") == true){
traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("highlight[") == true){
numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("maxFrags[") == true){
maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("mergeContiguous[") == true){
mergeContiguous = Boolean.valueOf(splits[i].substring("mergeContiguous[".length(),splits[i].length() - 1)).booleanValue();
} else if (splits[i].startsWith("fields[") == true){
paramFields = new HashSet<>();
String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
String [] fieldSplits = fieldNames.split(";");
for (int j = 0; j < fieldSplits.length; j++) {
paramFields.add(fieldSplits[j]);
}
}
} }
} }
/**
 * {@link HLImpl} that performs no highlighting: it only loads the highlight fields of each hit.
 */
private class NoHLImpl implements HLImpl {
  @Override
  public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
    // just retrieve the HL fields, visiting hits in doc id order
    final ScoreDoc[] ordered = docIdOrder(hits.scoreDocs);
    for (int i = 0; i < ordered.length; i++) {
      final boolean hasAnyField = searcher.doc(ordered[i].doc, hlFields).iterator().hasNext();
      // fold the outcome into the sink field so the retrieval is not dead code
      if (hasAnyField) {
        preventOptimizeAway += 2;
      } else {
        preventOptimizeAway += 1;
      }
    }
  }
}
} }

View File

@ -1,147 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import java.util.Set;
import java.util.Collection;
import java.util.HashSet;
import java.util.Collections;
/**
 * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents by using FastVectorHighlighter.
 *
 * <p>Note: This task reuses the reader if it is already open.
 * Otherwise a reader is opened at start and closed at the end.
 * </p>
 *
 * <p>Takes optional multivalued, comma separated param string as: size[&lt;traversal size&gt;],highlight[&lt;int&gt;],maxFrags[&lt;int&gt;],fragSize[&lt;int&gt;],fields[name1;name2;...]</p>
 * <ul>
 * <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
 * <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
 * <li>maxFrags - The maximum number of fragments to score by the highlighter. Default is 2</li>
 * <li>fragSize - The length of fragments. Default is 100</li>
 * <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
 * </ul>
 * Example:
 * <pre>"SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(size[10],highlight[10],maxFrags[3],fields[body]) &gt; : 1000
 * </pre>
 *
 * Fields must be stored, and both term vector offsets and term vector positions must be enabled,
 * for this task to work; {@link #setup()} rejects any other configuration.
 *
 * <p>Other side effects: counts additional 1 (record) for each traversed hit,
 * and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
 */
public class SearchTravRetVectorHighlightTask extends SearchTravTask {

  protected int numToHighlight = Integer.MAX_VALUE;
  protected int maxFrags = 2;
  protected int fragSize = 100;
  protected Set<String> paramFields = Collections.emptySet();
  protected FastVectorHighlighter highlighter;

  public SearchTravRetVectorHighlightTask(PerfRunData runData) {
    super(runData);
  }

  /**
   * Verifies that the run configuration stores documents and indexes term vector offsets and
   * positions.
   *
   * @throws Exception if any of the three required settings is not enabled
   */
  @Override
  public void setup() throws Exception {
    super.setup();
    //check to make sure either the doc is being stored
    PerfRunData data = getRunData();
    if (!data.getConfig().get("doc.stored", false)) {
      throw new Exception("doc.stored must be set to true");
    }
    if (!data.getConfig().get("doc.term.vector.offsets", false)) {
      throw new Exception("doc.term.vector.offsets must be set to true");
    }
    if (!data.getConfig().get("doc.term.vector.positions", false)) {
      throw new Exception("doc.term.vector.positions must be set to true");
    }
  }

  @Override
  public boolean withRetrieve() {
    return true;
  }

  @Override
  public int numToHighlight() {
    return numToHighlight;
  }

  /**
   * Creates a fresh {@link FastVectorHighlighter} for the query and wraps it in a
   * {@link BenchmarkHighlighter}.
   *
   * @param q the query whose matches are highlighted
   * @return a highlighter callback that reports the number of fragments produced
   */
  @Override
  protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
    highlighter = new FastVectorHighlighter( false, false );
    final Query myq = q;
    return new BenchmarkHighlighter(){
      @Override
      public int doHighlight(IndexReader reader, int doc, String field,
          Document document, Analyzer analyzer, String text) throws Exception {
        final FieldQuery fq = highlighter.getFieldQuery( myq, reader);
        String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags);
        // no fragments at all is reported as a null array
        return fragments != null ? fragments.length : 0;
      }
    };
  }

  /**
   * Selects the fields to highlight for a document.
   *
   * <ul>
   * <li>no {@code fields[...]} param given: every field the superclass reports is attempted</li>
   * <li>the superclass reports nothing (e.g. nothing stored): the requested fields are used as-is</li>
   * <li>otherwise: the intersection of both</li>
   * </ul>
   */
  @Override
  protected Collection<String> getFieldsToHighlight(Document document) {
    Collection<String> result = super.getFieldsToHighlight(document);
    if (paramFields.isEmpty()) {
      // Nothing was requested explicitly, so highlight everything, as documented on this class.
      // Returning the empty paramFields here would silently disable highlighting.
      return result;
    }
    if (result.isEmpty()) {
      //if stored is false, then result will be empty, in which case just get all the param fields
      return paramFields;
    }
    result.retainAll(paramFields);
    return result;
  }

  /**
   * Parses the comma separated {@code key[value]} params described in the class documentation.
   * Unrecognized entries are ignored.
   *
   * @param params the raw param string of this task
   */
  @Override
  public void setParams(String params) {
    // can't call super because super doesn't understand our
    // params syntax
    // Still keep the raw string on the task, as SearchTravRetHighlightTask.setParams does.
    this.params = params;
    for (String rawToken : params.split(",")) {
      // tolerate blanks around the separating commas
      final String token = rawToken.trim();
      if (token.startsWith("size[")) {
        traversalSize = bracketedInt(token, "size[");
      } else if (token.startsWith("highlight[")) {
        numToHighlight = bracketedInt(token, "highlight[");
      } else if (token.startsWith("maxFrags[")) {
        maxFrags = bracketedInt(token, "maxFrags[");
      } else if (token.startsWith("fragSize[")) {
        fragSize = bracketedInt(token, "fragSize[");
      } else if (token.startsWith("fields[")) {
        paramFields = new HashSet<>();
        Collections.addAll(paramFields, bracketedValue(token, "fields[").split(";"));
      }
    }
  }

  /** Returns the text between {@code prefix} and the final character (the closing bracket) of {@code token}. */
  private static String bracketedValue(String token, String prefix) {
    return token.substring(prefix.length(), token.length() - 1);
  }

  /** Parses the bracketed value as a number and truncates it to an int, so "10" and "10.0" are both accepted. */
  private static int bracketedInt(String token, String prefix) {
    return (int) Float.parseFloat(bracketedValue(token, prefix));
  }
}

View File

@ -31,9 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.benchmark.BenchmarkTestCase; import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.stats.TaskStats; import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask; import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer; import org.apache.lucene.collation.CollationKeyAnalyzer;
@ -159,110 +157,6 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
//assertTrue(CountingSearchTestTask.numSearches > 0); //assertTrue(CountingSearchTestTask.numSearches > 0);
} }
/**
 * Runs the counting highlighter task against an index with stored fields but no term vectors
 * and checks the retrieval/highlight counters and the resulting index.
 */
public void testHighlighting() throws Exception {
  // 1. alg definition (required in every "logic" test)
  String[] algLines = {
      "doc.stored=true",
      "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
      "docs.file=" + getReuters20LinesFile(),
      "query.maker=" + ReutersQueryMaker.class.getName(),
      "ResetSystemErase",
      "CreateIndex",
      "{ AddDoc } : 100",
      "ForceMerge(1)",
      "CloseIndex",
      "OpenReader",
      "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
      "CloseReader",
  };

  // 2. reset the static counters that are checked after the run
  CountingHighlighterTestTask.numHighlightedResults = 0;
  CountingHighlighterTestTask.numDocsRetrieved = 0;

  // 3. execute the algorithm (required in every "logic" test)
  Benchmark benchmark = execBenchmark(algLines);

  // 4. test specific checks after the benchmark run completed.
  assertEquals("Unexpected number of documents retrieved by the highlighter task!", 92, CountingHighlighterTestTask.numDocsRetrieved);
  //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
  //we probably should use a different doc/query maker, but...
  assertTrue("Highlighter task was supposed to produce at least one highlighted result!",
      CountingHighlighterTestTask.numHighlightedResults > 0);
  assertTrue("There can never be more highlighted results than retrieved docs!",
      CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults);

  assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
  // now we should be able to open the index for write.
  IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
  iw.close();
  // try-with-resources: the reader is released even when the assertion below fails
  try (IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory())) {
    assertEquals("100 docs were added to the index, this is what we expect to find!", 100, ir.numDocs());
  }
}
/**
 * Runs the counting highlighter task against an index with stored fields and term vectors
 * (with offsets) and checks the retrieval/highlight counters and the resulting index.
 */
public void testHighlightingTV() throws Exception {
  // 1. alg definition (required in every "logic" test)
  String[] algLines = {
      "doc.stored=true",//doc storage is required in order to have text to highlight
      "doc.term.vector=true",
      "doc.term.vector.offsets=true",
      "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
      "docs.file=" + getReuters20LinesFile(),
      "query.maker=" + ReutersQueryMaker.class.getName(),
      "ResetSystemErase",
      "CreateIndex",
      "{ AddDoc } : 1000",
      "ForceMerge(1)",
      "CloseIndex",
      "OpenReader",
      "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
      "CloseReader",
  };

  // 2. reset the static counters that are checked after the run
  CountingHighlighterTestTask.numHighlightedResults = 0;
  CountingHighlighterTestTask.numDocsRetrieved = 0;

  // 3. execute the algorithm (required in every "logic" test)
  Benchmark benchmark = execBenchmark(algLines);

  // 4. test specific checks after the benchmark run completed.
  assertEquals("Unexpected number of documents retrieved by the highlighter task!", 92, CountingHighlighterTestTask.numDocsRetrieved);
  //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
  //we probably should use a different doc/query maker, but...
  assertTrue("Highlighter task was supposed to produce at least one highlighted result!",
      CountingHighlighterTestTask.numHighlightedResults > 0);
  assertTrue("There can never be more highlighted results than retrieved docs!",
      CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults);

  assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
  // now we should be able to open the index for write.
  IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
  iw.close();
  // try-with-resources: the reader is released even when the assertion below fails
  try (IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory())) {
    assertEquals("1000 docs were added to the index, this is what we expect to find!", 1000, ir.numDocs());
  }
}
/**
 * Runs the counting highlighter task with document storing switched off and expects the
 * benchmark run to fail with an exception.
 */
public void testHighlightingNoTvNoStore() throws Exception {
  // 1. alg definition (required in every "logic" test)
  final String[] algLines = {
      "doc.stored=false",
      "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
      "docs.file=" + getReuters20LinesFile(),
      "query.maker=" + ReutersQueryMaker.class.getName(),
      "ResetSystemErase",
      "CreateIndex",
      "{ AddDoc } : 1000",
      "ForceMerge(1)",
      "CloseIndex",
      "OpenReader",
      "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
      "CloseReader",
  };

  // 2. start from clean static counters
  CountingHighlighterTestTask.numHighlightedResults = 0;
  CountingHighlighterTestTask.numDocsRetrieved = 0;

  // 3. executing the algorithm must fail
  expectThrows(Exception.class, () -> execBenchmark(algLines));
}
/** /**
* Test Exhasting Doc Maker logic * Test Exhasting Doc Maker logic
*/ */

View File

@ -1,68 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask.tasks;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
/**
 * Highlighting test task that tallies, in static counters, how many documents were retrieved
 * and how many highlight fragments were returned.
 */
public class CountingHighlighterTestTask extends SearchTravRetHighlightTask {

  /** Running total of text fragments returned by the highlighter. */
  public static int numHighlightedResults = 0;
  /** Running total of non-null documents returned by {@link #retrieveDoc(IndexReader, int)}. */
  public static int numDocsRetrieved = 0;

  public CountingHighlighterTestTask(PerfRunData runData) {
    super(runData);
  }

  /** Loads the document with the given id and counts it when it is not null. */
  @Override
  protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
    final Document retrieved = ir.document(id);
    if (retrieved != null) {
      numDocsRetrieved += 1;
    }
    return retrieved;
  }

  /**
   * Builds a highlighter for the query and returns a callback that highlights one field value
   * and adds the number of fragments found to {@link #numHighlightedResults}.
   */
  @Override
  public BenchmarkHighlighter getBenchmarkHighlighter(Query q) {
    final SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
    final QueryScorer scorer = new QueryScorer(q);
    highlighter = new Highlighter(formatter, scorer);
    return new BenchmarkHighlighter() {
      @Override
      public int doHighlight(IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text) throws Exception {
        final int lastStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
        final TokenStream stream = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, lastStartOffset);
        final TextFragment[] found = highlighter.getBestTextFragments(stream, text, mergeContiguous, maxFrags);
        // a null array counts as zero fragments
        final int fragmentCount;
        if (found == null) {
          fragmentCount = 0;
        } else {
          fragmentCount = found.length;
        }
        numHighlightedResults += fragmentCount;
        return fragmentCount;
      }
    };
  }
}