mirror of https://github.com/apache/lucene.git
LUCENE-7438: Renovate benchmark module's support for highlighting
parent 6aa28bd655
commit 5ef60af9c1
@@ -192,6 +192,8 @@
// excludes:
exclude(name: '**/build/**')
exclude(name: '**/dist/**')
exclude(name: 'lucene/benchmark/work/**')
exclude(name: 'lucene/benchmark/temp/**')
exclude(name: '**/CheckLoggingConfiguration.java')
exclude(name: 'build.xml') // ourselves :-)
}

@@ -76,6 +76,9 @@ Other
* LUCENE-7452: Block join query exception suggests how to find a doc, which
violates orthogonality requirement. (Mikhail Khludnev)

* LUCENE-7438: Renovate the Benchmark module's support for benchmarking highlighting. All
highlighters are supported via SearchTravRetHighlight. (David Smiley)

Build

* LUCENE-7292: Fix build to use "--release 8" instead of "-release 8" on

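A rough sketch of how the renovated support is exercised from an .alg file, using lines that appear in the conf files and the SearchTravRetHighlightTask javadoc further down in this commit (the repetition count of 500 is illustrative only):

# iterate the highlighter implementation across rounds, reported per column name
highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
# or pin one implementation on the task itself:
{ "HL" SearchTravRetHighlight(type[UH],maxFrags[3],fields[body]) > : 500
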
@@ -1,2 +1,2 @@
temp/
work/
/temp
/work

@@ -13,10 +13,13 @@ writing, there is a page file in
http://download.wikimedia.org/enwiki/20070402/. You can download this
file manually and put it in temp. Note that the file you download will
probably have the date in the name, e.g.,
http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. When
you put it in temp, rename it to enwiki-latest-pages-articles.xml.bz2.
http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2.

If you use the EnwikiContentSource then the data will be decompressed on the fly
during the benchmark. If you want to benchmark indexing, you should probably decompress
it beforehand using the "enwiki" Ant target which will produce a work/enwiki.txt, after
which you can use LineDocSource in your benchmark.

After that, ant enwiki should process the data set and run a load
test. Ant targets get-enwiki, expand-enwiki, and extract-enwiki can
also be used to download, decompress, and extract (to individual files
test. Ant target enwiki will download, decompress, and extract (to individual files
in work/enwiki) the dataset, respectively.

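For an indexing benchmark against the pre-decompressed dump, a minimal sketch of the relevant .alg properties (LineDocSource and docs.file are used the same way elsewhere in this commit; work/enwiki.txt is the file the enwiki Ant target produces):

content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource
docs.file=work/enwiki.txt
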
@@ -1,80 +0,0 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------

ram.flush.mb=flush:32:32
compound=cmpnd:true:false

analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory

doc.stored=true
doc.tokenized=true
doc.term.vector=true
doc.term.vector.offsets=true
doc.term.vector.positions=true
log.step=2000

docs.dir=reuters-out

content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
docs.file=temp/enwiki-20070527-pages-articles.xml

query.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiQueryMaker
enwikiQueryMaker.disableSpanQueries=true

max.field.length=2147483647
highlighter.maxDocCharsToAnalyze=2147483647

# task at this depth or less would print when they start
task.max.depth.log=2

log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate"
CreateIndex
{ "MAddDocs" AddDoc } : 20000
ForceMerge(1)
CloseIndex
}
{
OpenReader
{ "WarmTV" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 100
CloseReader
}
{
"Rounds"

ResetSystemSoft

OpenReader
{ "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 200
CloseReader

ResetSystemSoft

OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 200
CloseReader

RepSumByPref Search

NewRound
} : 4

RepSumByNameRound
RepSumByName

@@ -14,55 +14,52 @@
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.

ram.flush.mb=flush:32:32
compound=cmpnd:true:false
# For postings-offsets with light term-vectors

analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory
work.dir=work/enwikiPostings
ram.flush.mb=64
compound=false

doc.stored=true
doc.tokenized=true
# offsets in postings:
doc.body.offsets=true
# term vector, but no positions/offsets with it
doc.term.vector=true
doc.term.vector.offsets=true
doc.term.vector.positions=true
log.step=2000

docs.dir=reuters-out
content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
docs.file=temp/enwiki-20070527-pages-articles.xml.bz2

content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
file.query.maker.file=conf/query-phrases.txt
log.queries=false
log.step.SearchTravRetHighlight=-1

query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV

# task at this depth or less would print when they start
task.max.depth.log=2

log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate"
CreateIndex
{ "MAddDocs" AddDoc } : 20000
ForceMerge(1)
[{ "MAddDocs" AddDoc > : 50000] : 4
CloseIndex
}
{ "Rounds"
} : 0

ResetSystemSoft
{
"Rounds"

ResetSystemSoft

OpenReader
{ "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000
OpenReader

CloseReader
{ "Warm" SearchTravRetHighlight > : 1000

RepSumByPref MAddDocs
{ "HL" SearchTravRetHighlight > : 500

NewRound
CloseReader

} : 4
NewRound
} : 6

RepSumByNameRound
RepSumByName
RepSumByPrefRound MAddDocs
RepSumByPrefRound HL

@@ -14,55 +14,51 @@
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.

ram.flush.mb=flush:32:32
compound=cmpnd:true:false
# This is a full-term vector configuration.

analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory
work.dir=work/enwikiTermVec
ram.flush.mb=64
compound=false

doc.stored=true
doc.tokenized=true
doc.term.vector=true
doc.term.vector.offsets=true
doc.term.vector.positions=true
log.step=2000
doc.term.vector.offsets=true

docs.dir=reuters-out
content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
docs.file=temp/enwiki-20070527-pages-articles.xml.bz2

content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
file.query.maker.file=conf/query-terms.txt
log.queries=false
log.step.SearchTravRetHighlight=-1

query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V

# task at this depth or less would print when they start
task.max.depth.log=2

log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate"
CreateIndex
{ "MAddDocs" AddDoc } : 20000
ForceMerge(1)
[{ "MAddDocs" AddDoc > : 50000] : 4
CloseIndex
}
{ "Rounds"
} : 0

ResetSystemSoft
{
"Rounds"

ResetSystemSoft

OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000
OpenReader

CloseReader
{ "Warm" SearchTravRetHighlight > : 1000

RepSumByPref MAddDocs
{ "HL" SearchTravRetHighlight > : 500

NewRound
CloseReader

NewRound
} : 4

RepSumByNameRound
RepSumByName
RepSumByPrefRound MAddDocs
RepSumByPrefRound HL

@@ -54,7 +54,7 @@ log.queries=true
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
CloseReader
OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
{ "SearchHlgtSameRdr" SearchTravRetHighlight(type[UH]) > : 1000

CloseReader

@@ -0,0 +1,10 @@
"Abraham Lincoln"
"Union Wisconsin"
"court of law"
"Field Theory" OR "Set Theory"
"Top 100"
"red hot chili"
"greatest guitarists"
"Planes, Trains & Automobiles" OR ships
"international airport"
"Xbox 360"

@@ -0,0 +1,10 @@
Abraham AND Lincoln
Union AND Wisconsin
court AND law
top AND 100
(field OR set) AND theory
red AND hot AND chili
greatest AND guitarists
(planes AND trains AND automobiles) OR ships
international AND airport
xbox AND 360

@@ -0,0 +1,7 @@
abrah* AND linc*
court* AND law*
(field OR set) AND theor*
red AND hot AND chili*
great* AND guitar*
(plan* AND train* AND automob*) OR ship*
international AND airport*

@@ -1,69 +0,0 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.

ram.flush.mb=flush:32:32
compound=cmpnd:true:false

analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory

doc.stored=true
doc.tokenized=true
doc.term.vector=true
doc.term.vector.offsets=true
doc.term.vector.positions=true
log.step=2000

docs.dir=reuters-out

content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource

query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

# task at this depth or less would print when they start
task.max.depth.log=2

log.queries=true
# -------------------------------------------------------------------------------------
{ "Populate"
CreateIndex
{ "MAddDocs" AddDoc } : 20000
ForceMerge(1)
CloseIndex
}
{ "Rounds"

ResetSystemSoft
OpenReader
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
CloseReader
OpenReader
{ "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000

CloseReader

RepSumByPref SearchHlgtSameRdr

NewRound

} : 2

RepSumByNameRound
RepSumByName
RepSumByPrefRound MAddDocs

@@ -349,6 +349,8 @@ public class PerfRunData implements Closeable {
// Hold reference to new IR
indexReader.incRef();
indexSearcher = new IndexSearcher(indexReader);
// TODO Some day we should make the query cache in this module configurable and control clearing the cache
indexSearcher.setQueryCache(null);
} else {
indexSearcher = null;
}

@@ -43,6 +43,7 @@ import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;

/**
 * Creates {@link Document} objects. Uses a {@link ContentSource} to generate

@@ -58,6 +59,8 @@ import org.apache.lucene.document.TextField;
 * (default <b>true</b>).
 * <li><b>doc.body.tokenized</b> - specifies whether the
 * body field should be tokenized (default = <b>doc.tokenized</b>).
 * <li><b>doc.body.offsets</b> - specifies whether to add offsets into the postings index
 * for the body field. It is useful for highlighting. (default <b>false</b>)
 * <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
 * the index or not. (default <b>false</b>).
 * <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be

@@ -424,6 +427,7 @@ public class DocMaker implements Closeable {
boolean bodyTokenized = config.get("doc.body.tokenized", tokenized);
boolean norms = config.get("doc.tokenized.norms", false);
boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
boolean bodyOffsets = config.get("doc.body.offsets", false);
boolean termVec = config.get("doc.term.vector", false);
boolean termVecPositions = config.get("doc.term.vector.positions", false);
boolean termVecOffsets = config.get("doc.term.vector.offsets", false);

@@ -441,6 +445,9 @@ public class DocMaker implements Closeable {
bodyValType.setStored(bodyStored);
bodyValType.setTokenized(bodyTokenized);
bodyValType.setOmitNorms(!bodyNorms);
if (bodyTokenized && bodyOffsets) {
bodyValType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
}
bodyValType.setStoreTermVectors(termVec);
bodyValType.setStoreTermVectorPositions(termVecPositions);
bodyValType.setStoreTermVectorOffsets(termVecOffsets);

@@ -1,30 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.tasks;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;

/**
 * Abstract class for benchmarking highlighting performance
 */
public abstract class BenchmarkHighlighter {
public abstract int doHighlight( IndexReader reader, int doc, String field,
Document document, Analyzer analyzer, String text ) throws Exception ;
}

@@ -75,7 +75,7 @@ public abstract class ReadTask extends PerfTask {
int res = 0;

// open reader or use existing one
IndexSearcher searcher = getRunData().getIndexSearcher();
IndexSearcher searcher = getRunData().getIndexSearcher(); // (will incRef the reader)

IndexReader reader;

@@ -132,46 +132,20 @@ public abstract class ReadTask extends PerfTask {
//hits = collector.topDocs();
}

final String printHitsField = getRunData().getConfig().get("print.hits.field", null);
if (hits != null && printHitsField != null && printHitsField.length() > 0) {
System.out.println("totalHits = " + hits.totalHits);
System.out.println("maxDoc() = " + reader.maxDoc());
System.out.println("numDocs() = " + reader.numDocs());
for(int i=0;i<hits.scoreDocs.length;i++) {
final int docID = hits.scoreDocs[i].doc;
final Document doc = reader.document(docID);
System.out.println(" " + i + ": doc=" + docID + " score=" + hits.scoreDocs[i].score + " " + printHitsField + " =" + doc.get(printHitsField));
}
}

if (withTraverse()) {
final ScoreDoc[] scoreDocs = hits.scoreDocs;
int traversalSize = Math.min(scoreDocs.length, traversalSize());

if (traversalSize > 0) {
boolean retrieve = withRetrieve();
int numHighlight = Math.min(numToHighlight(), scoreDocs.length);
Analyzer analyzer = getRunData().getAnalyzer();
BenchmarkHighlighter highlighter = null;
if (numHighlight > 0) {
highlighter = getBenchmarkHighlighter(q);
}
for (int m = 0; m < traversalSize; m++) {
int id = scoreDocs[m].doc;
res++;
if (retrieve) {
Document document = retrieveDoc(reader, id);
res += document != null ? 1 : 0;
if (numHighlight > 0 && m < numHighlight) {
Collection<String> fieldsToHighlight = getFieldsToHighlight(document);
for (final String field : fieldsToHighlight) {
String text = document.get(field);
res += highlighter.doHighlight(reader, id, field, document, analyzer, text);
}
}
}
if (hits != null) {
final String printHitsField = getRunData().getConfig().get("print.hits.field", null);
if (printHitsField != null && printHitsField.length() > 0) {
System.out.println("totalHits = " + hits.totalHits);
System.out.println("maxDoc() = " + reader.maxDoc());
System.out.println("numDocs() = " + reader.numDocs());
for(int i=0;i<hits.scoreDocs.length;i++) {
final int docID = hits.scoreDocs[i].doc;
final Document doc = reader.document(docID);
System.out.println(" " + i + ": doc=" + docID + " score=" + hits.scoreDocs[i].score + " " + printHitsField + " =" + doc.get(printHitsField));
}
}

res += withTopDocs(searcher, q, hits);
}
}
}

@@ -185,6 +159,28 @@ public abstract class ReadTask extends PerfTask {
return res;
}

protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
IndexReader reader = searcher.getIndexReader();
int res = 0;
if (withTraverse()) {
final ScoreDoc[] scoreDocs = hits.scoreDocs;
int traversalSize = Math.min(scoreDocs.length, traversalSize());

if (traversalSize > 0) {
boolean retrieve = withRetrieve();
for (int m = 0; m < traversalSize; m++) {
int id = scoreDocs[m].doc;
res++;
if (retrieve) {
Document document = retrieveDoc(reader, id);
res += document != null ? 1 : 0;
}
}
}
}
return res;
}

protected Collector createCollector() throws Exception {
return TopScoreDocCollector.create(numHits());
}

@@ -267,39 +263,8 @@ public abstract class ReadTask extends PerfTask {
*/
public abstract boolean withRetrieve();

/**
* Set to the number of documents to highlight.
*
* @return The number of the results to highlight. O means no docs will be highlighted.
*/
public int numToHighlight() {
return 0;
}

/**
* Return an appropriate highlighter to be used with
* highlighting tasks
*/
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
return null;
}

protected Sort getSort() {
return null;
}

/**
* Define the fields to highlight. Base implementation returns all fields
* @param document The Document
* @return A Collection of Field names (Strings)
*/
protected Collection<String> getFieldsToHighlight(Document document) {
List<IndexableField> fields = document.getFields();
Set<String> result = new HashSet<>(fields.size());
for (final IndexableField f : fields) {
result.add(f.name());
}
return result;
}

}

@@ -14,65 +14,98 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.benchmark.byTask.tasks;


import java.util.Collection;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.WeightedFragListBuilder;
import org.apache.lucene.util.ArrayUtil;

/**
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
*
* Uses the {@link org.apache.lucene.search.highlight.SimpleHTMLFormatter} for formatting.
*
* <p>Note: This task reuses the reader if it is already open.
* Otherwise a reader is opened at start and closed at the end.
* </p>
*
* <p>Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]</p>
* <p>Takes optional multivalued, comma separated param string as: type[<enum>],maxFrags[<int>],fields[name1;name2;...]</p>
* <ul>
* <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
* <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
* <li>type - the highlighter implementation, e.g. "UH"</li>
* <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
* <li>mergeContiguous - true if contiguous fragments should be merged.</li>
* <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
* </ul>
* Example:
* <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
* <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(type[UH],maxFrags[3],fields[body]) > : 1000
* </pre>
*
* Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well.
* Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well,
* and offsets in postings is another option.
*
* <p>Other side effects: counts additional 1 (record) for each traversed hit,
* and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
*/
public class SearchTravRetHighlightTask extends SearchTravTask {

protected int numToHighlight = Integer.MAX_VALUE;
protected boolean mergeContiguous;
protected int maxFrags = 2;
protected Set<String> paramFields = Collections.emptySet();
protected Highlighter highlighter;
protected int maxDocCharsToAnalyze;
private int maxDocCharsToAnalyze; // max leading content chars to highlight
private int maxFrags = 1; // aka passages
private Set<String> hlFields = Collections.singleton("body");
private String type;
private HLImpl hlImpl;
private Analyzer analyzer;

public SearchTravRetHighlightTask(PerfRunData runData) {
super(runData);
}

@Override
public void setParams(String params) {
// can't call super because super doesn't understand our params syntax
this.params = params;
// TODO consider instead using data.getConfig().get("highlighter.*")?
String[] splits = params.split(",");
for (String split : splits) {
if (split.startsWith("type[") == true) {
type = split.substring("type[".length(), split.length() - 1);
} else if (split.startsWith("maxFrags[") == true) {
maxFrags = (int) Float.parseFloat(split.substring("maxFrags[".length(), split.length() - 1));
} else if (split.startsWith("fields[") == true) {
String fieldNames = split.substring("fields[".length(), split.length() - 1);
String[] fieldSplits = fieldNames.split(";");
hlFields = new HashSet<>(Arrays.asList(fieldSplits));
}
}
}

@Override
public void setup() throws Exception {
super.setup();

@@ -82,72 +115,188 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
throw new Exception("doc.stored must be set to true");
}
maxDocCharsToAnalyze = data.getConfig().get("highlighter.maxDocCharsToAnalyze", Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
}

@Override
public boolean withRetrieve() {
return true;
}

@Override
public int numToHighlight() {
return numToHighlight;
}

@Override
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
return new BenchmarkHighlighter(){
@Override
public int doHighlight(IndexReader reader, int doc, String field,
Document document, Analyzer analyzer, String text) throws Exception {
final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset);
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
return frag != null ? frag.length : 0;
}
};
}

@Override
protected Collection<String> getFieldsToHighlight(Document document) {
Collection<String> result = super.getFieldsToHighlight(document);
//if stored is false, then result will be empty, in which case just get all the param fields
if (paramFields.isEmpty() == false && result.isEmpty() == false) {
result.retainAll(paramFields);
} else {
result = paramFields;
analyzer = data.getAnalyzer();
String type = this.type;
if (type == null) {
type = data.getConfig().get("highlighter", null);
}
switch (type) {
case "NONE": hlImpl = new NoHLImpl(); break;
case "SH_A": hlImpl = new StandardHLImpl(false); break;
case "SH_V": hlImpl = new StandardHLImpl(true); break;

case "FVH_V": hlImpl = new FastVectorHLImpl(); break;

case "UH": hlImpl = new UnifiedHLImpl(null); break;
case "UH_A": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.ANALYSIS); break;
case "UH_V": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.TERM_VECTORS); break;
case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break;
case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break;

case "PH_P": hlImpl = new PostingsHLImpl(); break;

default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
}
return result;
}

// here is where we intercept ReadTask's logic to do the highlighting, and nothing else (no retrieval of all field vals)
@Override
public void setParams(String params) {
// can't call super because super doesn't understand our
// params syntax
this.params = params;
String [] splits = params.split(",");
for (int i = 0; i < splits.length; i++) {
if (splits[i].startsWith("size[") == true){
traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("highlight[") == true){
numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("maxFrags[") == true){
maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("mergeContiguous[") == true){
mergeContiguous = Boolean.valueOf(splits[i].substring("mergeContiguous[".length(),splits[i].length() - 1)).booleanValue();
} else if (splits[i].startsWith("fields[") == true){
paramFields = new HashSet<>();
String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
String [] fieldSplits = fieldNames.split(";");
for (int j = 0; j < fieldSplits.length; j++) {
paramFields.add(fieldSplits[j]);
protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
hlImpl.withTopDocs(searcher, q, hits);
// note: it'd be nice if we knew the sum kilobytes of text across these hits so we could return that. It'd be a more
// useful number to gauge the amount of work. But given "average" document sizes and lots of queries, returning the
// number of docs is reasonable.
return hits.scoreDocs.length; // always return # scored docs.
}

private interface HLImpl {
void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception;
}

private volatile int preventOptimizeAway = 0;

private class StandardHLImpl implements HLImpl {
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>");
DefaultEncoder encoder = new DefaultEncoder();
Highlighter highlighter = new Highlighter(formatter, encoder, null);
boolean termVecs;

StandardHLImpl(boolean termVecs) {
highlighter.setEncoder(new DefaultEncoder());
highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
this.termVecs = termVecs;
}

@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
IndexReader reader = searcher.getIndexReader();
highlighter.setFragmentScorer(new QueryScorer(q));
// highlighter.setTextFragmenter(); unfortunately no sentence mechanism, not even regex. Default here is trivial
for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
Document document = reader.document(scoreDoc.doc, hlFields);
Fields tvFields = termVecs ? reader.getTermVectors(scoreDoc.doc) : null;
for (IndexableField indexableField : document) {
TokenStream tokenStream;
if (termVecs) {
tokenStream = TokenSources.getTokenStream(indexableField.name(), tvFields,
indexableField.stringValue(), analyzer, maxDocCharsToAnalyze);
} else {
tokenStream = analyzer.tokenStream(indexableField.name(), indexableField.stringValue());
}
// will close TokenStream:
String[] fragments = highlighter.getBestFragments(tokenStream, indexableField.stringValue(), maxFrags);
preventOptimizeAway = fragments.length;
}
}
}
}

private class FastVectorHLImpl implements HLImpl {
int fragSize = 100;
WeightedFragListBuilder fragListBuilder = new WeightedFragListBuilder();
BoundaryScanner bs = new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ENGLISH));
ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(bs);
String[] preTags = {"<em>"};
String[] postTags = {"</em>"};
Encoder encoder = new DefaultEncoder();// new SimpleHTMLEncoder();
FastVectorHighlighter highlighter = new FastVectorHighlighter(
true, // phraseHighlight
false); // requireFieldMatch -- not pertinent to our benchmark

@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
IndexReader reader = searcher.getIndexReader();
final FieldQuery fq = highlighter.getFieldQuery( q, reader);
for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
for (String hlField : hlFields) {
String[] fragments = highlighter.getBestFragments(fq, reader, scoreDoc.doc, hlField, fragSize, maxFrags,
fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
preventOptimizeAway = fragments.length;
}
}
}
}

private ScoreDoc[] docIdOrder(ScoreDoc[] scoreDocs) {
ScoreDoc[] clone = new ScoreDoc[scoreDocs.length];
System.arraycopy(scoreDocs, 0, clone, 0, scoreDocs.length);
ArrayUtil.introSort(clone, (a, b) -> Integer.compare(a.doc, b.doc));
return clone;
}

private class PostingsHLImpl implements HLImpl {
PostingsHighlighter highlighter;
String[] fields = hlFields.toArray(new String[hlFields.size()]);
int[] maxPassages;
PostingsHLImpl() {
highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) {
@Override
protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards
return analyzer;
}

}
@Override
protected BreakIterator getBreakIterator(String field) {
return BreakIterator.getSentenceInstance(Locale.ENGLISH);
}
};
maxPassages = new int[hlFields.size()];
Arrays.fill(maxPassages, maxFrags);
}

@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages);
preventOptimizeAway = result.size();
}
}

private class UnifiedHLImpl implements HLImpl {
UnifiedHighlighter highlighter;
IndexSearcher lastSearcher;
UnifiedHighlighter.OffsetSource offsetSource; // null means auto select
String[] fields = hlFields.toArray(new String[hlFields.size()]);
int[] maxPassages;

UnifiedHLImpl(final UnifiedHighlighter.OffsetSource offsetSource) {
this.offsetSource = offsetSource;
maxPassages = new int[hlFields.size()];
Arrays.fill(maxPassages, maxFrags);
}

private void reset(IndexSearcher searcher) {
if (lastSearcher == searcher) {
return;
}
lastSearcher = searcher;
highlighter = new UnifiedHighlighter(searcher, analyzer) {
@Override
protected OffsetSource getOffsetSource(String field) {
return offsetSource != null ? offsetSource : super.getOffsetSource(field);
}
};
highlighter.setBreakIterator(() -> BreakIterator.getSentenceInstance(Locale.ENGLISH));
highlighter.setMaxLength(maxDocCharsToAnalyze);
highlighter.setHighlightPhrasesStrictly(true);
highlighter.setHandleMultiTermQuery(true);
}

@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
reset(searcher);
Map<String, String[]> result = highlighter.highlightFields(fields, q, hits, maxPassages);
preventOptimizeAway = result.size();
}
}

private class NoHLImpl implements HLImpl {

@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
//just retrieve the HL fields
for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
preventOptimizeAway += searcher.doc(scoreDoc.doc, hlFields).iterator().hasNext() ? 2 : 1;
}
}
}
}

@@ -1,147 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.tasks;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;

import java.util.Set;
import java.util.Collection;
import java.util.HashSet;
import java.util.Collections;

/**
 * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents by using FastVectorHighlighter.
 *
 * <p>Note: This task reuses the reader if it is already open.
 * Otherwise a reader is opened at start and closed at the end.
 * </p>
 *
 * <p>Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]</p>
 * <ul>
 * <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
 * <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
 * <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
 * <li>fragSize - The length of fragments</li>
 * <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
 * </ul>
 * Example:
 * <pre>"SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(size[10],highlight[10],maxFrags[3],fields[body]) > : 1000
 * </pre>
 *
 * Fields must be stored and term vector offsets and positions in order must be true for this task to work.
 *
 * <p>Other side effects: counts additional 1 (record) for each traversed hit,
 * and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
 */
public class SearchTravRetVectorHighlightTask extends SearchTravTask {

protected int numToHighlight = Integer.MAX_VALUE;
protected int maxFrags = 2;
protected int fragSize = 100;
protected Set<String> paramFields = Collections.emptySet();
protected FastVectorHighlighter highlighter;

public SearchTravRetVectorHighlightTask(PerfRunData runData) {
super(runData);
}

@Override
public void setup() throws Exception {
super.setup();
//check to make sure either the doc is being stored
PerfRunData data = getRunData();
if (data.getConfig().get("doc.stored", false) == false){
throw new Exception("doc.stored must be set to true");
}
if (data.getConfig().get("doc.term.vector.offsets", false) == false){
throw new Exception("doc.term.vector.offsets must be set to true");
}
if (data.getConfig().get("doc.term.vector.positions", false) == false){
throw new Exception("doc.term.vector.positions must be set to true");
}
}

@Override
public boolean withRetrieve() {
return true;
}

@Override
public int numToHighlight() {
return numToHighlight;
}

@Override
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
highlighter = new FastVectorHighlighter( false, false );
final Query myq = q;
return new BenchmarkHighlighter(){
@Override
public int doHighlight(IndexReader reader, int doc, String field,
Document document, Analyzer analyzer, String text) throws Exception {
final FieldQuery fq = highlighter.getFieldQuery( myq, reader);
String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags);
return fragments != null ? fragments.length : 0;
}
};
}

@Override
protected Collection<String> getFieldsToHighlight(Document document) {
Collection<String> result = super.getFieldsToHighlight(document);
//if stored is false, then result will be empty, in which case just get all the param fields
if (paramFields.isEmpty() == false && result.isEmpty() == false) {
result.retainAll(paramFields);
} else {
result = paramFields;
}
return result;
}

@Override
public void setParams(String params) {
// can't call super because super doesn't understand our
// params syntax
final String [] splits = params.split(",");
for (int i = 0; i < splits.length; i++) {
if (splits[i].startsWith("size[") == true){
traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("highlight[") == true){
numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("maxFrags[") == true){
maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("fragSize[") == true){
fragSize = (int)Float.parseFloat(splits[i].substring("fragSize[".length(),splits[i].length() - 1));
} else if (splits[i].startsWith("fields[") == true){
paramFields = new HashSet<>();
String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
String [] fieldSplits = fieldNames.split(";");
for (int j = 0; j < fieldSplits.length; j++) {
paramFields.add(fieldSplits[j]);
}

}
}
}
}

@@ -31,9 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer;

@@ -159,110 +157,6 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
//assertTrue(CountingSearchTestTask.numSearches > 0);
}

public void testHighlighting() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"doc.stored=true",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"query.maker=" + ReutersQueryMaker.class.getName(),
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 100",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
"CloseReader",
};

// 2. we test this value later
CountingHighlighterTestTask.numHighlightedResults = 0;
CountingHighlighterTestTask.numDocsRetrieved = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);

// 4. test specific checks after the benchmark run completed.
assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
//pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
//we probably should use a different doc/query maker, but...
assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);

assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals("100 docs were added to the index, this is what we expect to find!",100,ir.numDocs());
ir.close();
}

public void testHighlightingTV() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"doc.stored=true",//doc storage is required in order to have text to highlight
"doc.term.vector=true",
"doc.term.vector.offsets=true",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"query.maker=" + ReutersQueryMaker.class.getName(),
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
"CloseReader",
};

// 2. we test this value later
CountingHighlighterTestTask.numHighlightedResults = 0;
CountingHighlighterTestTask.numDocsRetrieved = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);

// 4. test specific checks after the benchmark run completed.
assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
//pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
//we probably should use a different doc/query maker, but...
assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);

assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
iw.close();
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
ir.close();
}

public void testHighlightingNoTvNoStore() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"doc.stored=false",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
"query.maker=" + ReutersQueryMaker.class.getName(),
"ResetSystemErase",
"CreateIndex",
"{ AddDoc } : 1000",
"ForceMerge(1)",
"CloseIndex",
"OpenReader",
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
"CloseReader",
};

// 2. we test this value later
CountingHighlighterTestTask.numHighlightedResults = 0;
CountingHighlighterTestTask.numDocsRetrieved = 0;
// 3. execute the algorithm (required in every "logic" test)
expectThrows(Exception.class, () -> {
execBenchmark(algLines);
});
}

/**
* Test Exhasting Doc Maker logic
*/

@@ -1,68 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.tasks;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;

/**
 * Test Search task which counts number of searches.
 */
public class CountingHighlighterTestTask extends SearchTravRetHighlightTask {

public static int numHighlightedResults = 0;
public static int numDocsRetrieved = 0;

public CountingHighlighterTestTask(PerfRunData runData) {
super(runData);
}

@Override
protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
Document document = ir.document(id);
if (document != null) {
numDocsRetrieved++;
}
return document;
}

@Override
public BenchmarkHighlighter getBenchmarkHighlighter(Query q) {
highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
return new BenchmarkHighlighter() {
@Override
public int doHighlight(IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text) throws Exception {
final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset);
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
numHighlightedResults += frag != null ? frag.length : 0;
return frag != null ? frag.length : 0;
}
};
}
}