mirror of https://github.com/apache/lucene.git
LUCENE-7438: Renovate benchmark module's support for highlighting
This commit is contained in:
parent
6aa28bd655
commit
5ef60af9c1
|
@ -192,6 +192,8 @@
|
||||||
// excludes:
|
// excludes:
|
||||||
exclude(name: '**/build/**')
|
exclude(name: '**/build/**')
|
||||||
exclude(name: '**/dist/**')
|
exclude(name: '**/dist/**')
|
||||||
|
exclude(name: 'lucene/benchmark/work/**')
|
||||||
|
exclude(name: 'lucene/benchmark/temp/**')
|
||||||
exclude(name: '**/CheckLoggingConfiguration.java')
|
exclude(name: '**/CheckLoggingConfiguration.java')
|
||||||
exclude(name: 'build.xml') // ourselves :-)
|
exclude(name: 'build.xml') // ourselves :-)
|
||||||
}
|
}
|
||||||
|
|
|
@ -76,6 +76,9 @@ Other
|
||||||
* LUCENE-7452: Block join query exception suggests how to find a doc, which
|
* LUCENE-7452: Block join query exception suggests how to find a doc, which
|
||||||
violates orthogonality requirement. (Mikhail Khludnev)
|
violates orthogonality requirement. (Mikhail Khludnev)
|
||||||
|
|
||||||
|
* LUCENE-7438: Renovate the Benchmark module's support for benchmarking highlighting. All
|
||||||
|
highlighters are supported via SearchTravRetHighlight. (David Smiley)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-7292: Fix build to use "--release 8" instead of "-release 8" on
|
* LUCENE-7292: Fix build to use "--release 8" instead of "-release 8" on
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
temp/
|
/temp
|
||||||
work/
|
/work
|
|
@ -13,10 +13,13 @@ writing, there is a page file in
|
||||||
http://download.wikimedia.org/enwiki/20070402/. You can download this
|
http://download.wikimedia.org/enwiki/20070402/. You can download this
|
||||||
file manually and put it in temp. Note that the file you download will
|
file manually and put it in temp. Note that the file you download will
|
||||||
probably have the date in the name, e.g.,
|
probably have the date in the name, e.g.,
|
||||||
http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. When
|
http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2.
|
||||||
you put it in temp, rename it to enwiki-latest-pages-articles.xml.bz2.
|
|
||||||
|
If you use the EnwikiContentSource then the data will be decompressed on the fly
|
||||||
|
during the benchmark. If you want to benchmark indexing, you should probably decompress
|
||||||
|
it beforehand using the "enwiki" Ant target which will produce a work/enwiki.txt, after
|
||||||
|
which you can use LineDocSource in your benchmark.
|
||||||
|
|
||||||
After that, ant enwiki should process the data set and run a load
|
After that, ant enwiki should process the data set and run a load
|
||||||
test. Ant targets get-enwiki, expand-enwiki, and extract-enwiki can
|
test. Ant target enwiki will download, decompress, and extract (to individual files
|
||||||
also be used to download, decompress, and extract (to individual files
|
|
||||||
in work/enwiki) the dataset, respectively.
|
in work/enwiki) the dataset.
|
||||||
|
|
|
@ -1,80 +0,0 @@
|
||||||
#/**
|
|
||||||
# * Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
# * contributor license agreements. See the NOTICE file distributed with
|
|
||||||
# * this work for additional information regarding copyright ownership.
|
|
||||||
# * The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
# * (the "License"); you may not use this file except in compliance with
|
|
||||||
# * the License. You may obtain a copy of the License at
|
|
||||||
# *
|
|
||||||
# * http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
# *
|
|
||||||
# * Unless required by applicable law or agreed to in writing, software
|
|
||||||
# * distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# * See the License for the specific language governing permissions and
|
|
||||||
# * limitations under the License.
|
|
||||||
# */
|
|
||||||
# -------------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
ram.flush.mb=flush:32:32
|
|
||||||
compound=cmpnd:true:false
|
|
||||||
|
|
||||||
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
|
|
||||||
directory=FSDirectory
|
|
||||||
|
|
||||||
doc.stored=true
|
|
||||||
doc.tokenized=true
|
|
||||||
doc.term.vector=true
|
|
||||||
doc.term.vector.offsets=true
|
|
||||||
doc.term.vector.positions=true
|
|
||||||
log.step=2000
|
|
||||||
|
|
||||||
docs.dir=reuters-out
|
|
||||||
|
|
||||||
content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
|
|
||||||
docs.file=temp/enwiki-20070527-pages-articles.xml
|
|
||||||
|
|
||||||
query.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiQueryMaker
|
|
||||||
enwikiQueryMaker.disableSpanQueries=true
|
|
||||||
|
|
||||||
max.field.length=2147483647
|
|
||||||
highlighter.maxDocCharsToAnalyze=2147483647
|
|
||||||
|
|
||||||
# task at this depth or less would print when they start
|
|
||||||
task.max.depth.log=2
|
|
||||||
|
|
||||||
log.queries=true
|
|
||||||
# -------------------------------------------------------------------------------------
|
|
||||||
{ "Populate"
|
|
||||||
CreateIndex
|
|
||||||
{ "MAddDocs" AddDoc } : 20000
|
|
||||||
ForceMerge(1)
|
|
||||||
CloseIndex
|
|
||||||
}
|
|
||||||
{
|
|
||||||
OpenReader
|
|
||||||
{ "WarmTV" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 100
|
|
||||||
CloseReader
|
|
||||||
}
|
|
||||||
{
|
|
||||||
"Rounds"
|
|
||||||
|
|
||||||
ResetSystemSoft
|
|
||||||
|
|
||||||
OpenReader
|
|
||||||
{ "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 200
|
|
||||||
CloseReader
|
|
||||||
|
|
||||||
ResetSystemSoft
|
|
||||||
|
|
||||||
OpenReader
|
|
||||||
{ "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 200
|
|
||||||
CloseReader
|
|
||||||
|
|
||||||
RepSumByPref Search
|
|
||||||
|
|
||||||
NewRound
|
|
||||||
} : 4
|
|
||||||
|
|
||||||
RepSumByNameRound
|
|
||||||
RepSumByName
|
|
|
@ -14,55 +14,52 @@
|
||||||
# * See the License for the specific language governing permissions and
|
# * See the License for the specific language governing permissions and
|
||||||
# * limitations under the License.
|
# * limitations under the License.
|
||||||
# */
|
# */
|
||||||
# -------------------------------------------------------------------------------------
|
|
||||||
# multi val params are iterated by NewRound's, added to reports, start with column name.
|
|
||||||
|
|
||||||
ram.flush.mb=flush:32:32
|
# For postings-offsets with light term-vectors
|
||||||
compound=cmpnd:true:false
|
|
||||||
|
|
||||||
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
|
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
|
||||||
directory=FSDirectory
|
directory=FSDirectory
|
||||||
|
work.dir=work/enwikiPostings
|
||||||
|
ram.flush.mb=64
|
||||||
|
compound=false
|
||||||
|
|
||||||
doc.stored=true
|
doc.stored=true
|
||||||
doc.tokenized=true
|
doc.tokenized=true
|
||||||
|
# offsets in postings:
|
||||||
|
doc.body.offsets=true
|
||||||
|
# term vector, but no positions/offsets with it
|
||||||
doc.term.vector=true
|
doc.term.vector=true
|
||||||
doc.term.vector.offsets=true
|
|
||||||
doc.term.vector.positions=true
|
|
||||||
log.step=2000
|
|
||||||
|
|
||||||
docs.dir=reuters-out
|
content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
|
||||||
|
docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
|
||||||
|
|
||||||
content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
|
query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
|
||||||
|
file.query.maker.file=conf/query-phrases.txt
|
||||||
|
log.queries=false
|
||||||
|
log.step.SearchTravRetHighlight=-1
|
||||||
|
|
||||||
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
|
highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
|
||||||
|
|
||||||
# task at this depth or less would print when they start
|
|
||||||
task.max.depth.log=2
|
|
||||||
|
|
||||||
log.queries=true
|
|
||||||
# -------------------------------------------------------------------------------------
|
|
||||||
{ "Populate"
|
{ "Populate"
|
||||||
CreateIndex
|
CreateIndex
|
||||||
{ "MAddDocs" AddDoc } : 20000
|
[{ "MAddDocs" AddDoc > : 50000] : 4
|
||||||
ForceMerge(1)
|
|
||||||
CloseIndex
|
CloseIndex
|
||||||
}
|
} : 0
|
||||||
{ "Rounds"
|
|
||||||
|
{
|
||||||
|
"Rounds"
|
||||||
|
|
||||||
ResetSystemSoft
|
ResetSystemSoft
|
||||||
|
|
||||||
|
|
||||||
OpenReader
|
OpenReader
|
||||||
{ "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000
|
|
||||||
|
{ "Warm" SearchTravRetHighlight > : 1000
|
||||||
|
|
||||||
|
{ "HL" SearchTravRetHighlight > : 500
|
||||||
|
|
||||||
CloseReader
|
CloseReader
|
||||||
|
|
||||||
RepSumByPref MAddDocs
|
|
||||||
|
|
||||||
NewRound
|
NewRound
|
||||||
|
} : 6
|
||||||
|
|
||||||
} : 4
|
RepSumByPrefRound HL
|
||||||
|
|
||||||
RepSumByNameRound
|
|
||||||
RepSumByName
|
|
||||||
RepSumByPrefRound MAddDocs
|
|
|
@ -14,55 +14,51 @@
|
||||||
# * See the License for the specific language governing permissions and
|
# * See the License for the specific language governing permissions and
|
||||||
# * limitations under the License.
|
# * limitations under the License.
|
||||||
# */
|
# */
|
||||||
# -------------------------------------------------------------------------------------
|
|
||||||
# multi val params are iterated by NewRound's, added to reports, start with column name.
|
|
||||||
|
|
||||||
ram.flush.mb=flush:32:32
|
# This is a full term-vector configuration.
|
||||||
compound=cmpnd:true:false
|
|
||||||
|
|
||||||
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
|
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
|
||||||
directory=FSDirectory
|
directory=FSDirectory
|
||||||
|
work.dir=work/enwikiTermVec
|
||||||
|
ram.flush.mb=64
|
||||||
|
compound=false
|
||||||
|
|
||||||
doc.stored=true
|
doc.stored=true
|
||||||
doc.tokenized=true
|
doc.tokenized=true
|
||||||
doc.term.vector=true
|
doc.term.vector=true
|
||||||
doc.term.vector.offsets=true
|
|
||||||
doc.term.vector.positions=true
|
doc.term.vector.positions=true
|
||||||
log.step=2000
|
doc.term.vector.offsets=true
|
||||||
|
|
||||||
docs.dir=reuters-out
|
content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
|
||||||
|
docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
|
||||||
|
|
||||||
content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
|
query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
|
||||||
|
file.query.maker.file=conf/query-terms.txt
|
||||||
|
log.queries=false
|
||||||
|
log.step.SearchTravRetHighlight=-1
|
||||||
|
|
||||||
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
|
highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V
|
||||||
|
|
||||||
# task at this depth or less would print when they start
|
|
||||||
task.max.depth.log=2
|
|
||||||
|
|
||||||
log.queries=true
|
|
||||||
# -------------------------------------------------------------------------------------
|
|
||||||
{ "Populate"
|
{ "Populate"
|
||||||
CreateIndex
|
CreateIndex
|
||||||
{ "MAddDocs" AddDoc } : 20000
|
[{ "MAddDocs" AddDoc > : 50000] : 4
|
||||||
ForceMerge(1)
|
|
||||||
CloseIndex
|
CloseIndex
|
||||||
}
|
} : 0
|
||||||
{ "Rounds"
|
|
||||||
|
{
|
||||||
|
"Rounds"
|
||||||
|
|
||||||
ResetSystemSoft
|
ResetSystemSoft
|
||||||
|
|
||||||
|
|
||||||
OpenReader
|
OpenReader
|
||||||
{ "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000
|
|
||||||
|
{ "Warm" SearchTravRetHighlight > : 1000
|
||||||
|
|
||||||
|
{ "HL" SearchTravRetHighlight > : 500
|
||||||
|
|
||||||
CloseReader
|
CloseReader
|
||||||
|
|
||||||
RepSumByPref MAddDocs
|
|
||||||
|
|
||||||
NewRound
|
NewRound
|
||||||
|
|
||||||
} : 4
|
} : 4
|
||||||
|
|
||||||
RepSumByNameRound
|
RepSumByPrefRound HL
|
||||||
RepSumByName
|
|
||||||
RepSumByPrefRound MAddDocs
|
|
|
@ -54,7 +54,7 @@ log.queries=true
|
||||||
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
|
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
|
||||||
CloseReader
|
CloseReader
|
||||||
OpenReader
|
OpenReader
|
||||||
{ "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
|
{ "SearchHlgtSameRdr" SearchTravRetHighlight(type[UH]) > : 1000
|
||||||
|
|
||||||
CloseReader
|
CloseReader
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
"Abraham Lincoln"
|
||||||
|
"Union Wisconsin"
|
||||||
|
"court of law"
|
||||||
|
"Field Theory" OR "Set Theory"
|
||||||
|
"Top 100"
|
||||||
|
"red hot chili"
|
||||||
|
"greatest guitarists"
|
||||||
|
"Planes, Trains & Automobiles" OR ships
|
||||||
|
"international airport"
|
||||||
|
"Xbox 360"
|
|
@ -0,0 +1,10 @@
|
||||||
|
Abraham AND Lincoln
|
||||||
|
Union AND Wisconsin
|
||||||
|
court AND law
|
||||||
|
top AND 100
|
||||||
|
(field OR set) AND theory
|
||||||
|
red AND hot AND chili
|
||||||
|
greatest AND guitarists
|
||||||
|
(planes AND trains AND automobiles) OR ships
|
||||||
|
international AND airport
|
||||||
|
xbox AND 360
|
|
@ -0,0 +1,7 @@
|
||||||
|
abrah* AND linc*
|
||||||
|
court* AND law*
|
||||||
|
(field OR set) AND theor*
|
||||||
|
red AND hot AND chili*
|
||||||
|
great* AND guitar*
|
||||||
|
(plan* AND train* AND automob*) OR ship*
|
||||||
|
international AND airport*
|
|
@ -1,69 +0,0 @@
|
||||||
#/**
|
|
||||||
# * Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
# * contributor license agreements. See the NOTICE file distributed with
|
|
||||||
# * this work for additional information regarding copyright ownership.
|
|
||||||
# * The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
# * (the "License"); you may not use this file except in compliance with
|
|
||||||
# * the License. You may obtain a copy of the License at
|
|
||||||
# *
|
|
||||||
# * http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
# *
|
|
||||||
# * Unless required by applicable law or agreed to in writing, software
|
|
||||||
# * distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# * See the License for the specific language governing permissions and
|
|
||||||
# * limitations under the License.
|
|
||||||
# */
|
|
||||||
# -------------------------------------------------------------------------------------
|
|
||||||
# multi val params are iterated by NewRound's, added to reports, start with column name.
|
|
||||||
|
|
||||||
ram.flush.mb=flush:32:32
|
|
||||||
compound=cmpnd:true:false
|
|
||||||
|
|
||||||
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
|
|
||||||
directory=FSDirectory
|
|
||||||
|
|
||||||
doc.stored=true
|
|
||||||
doc.tokenized=true
|
|
||||||
doc.term.vector=true
|
|
||||||
doc.term.vector.offsets=true
|
|
||||||
doc.term.vector.positions=true
|
|
||||||
log.step=2000
|
|
||||||
|
|
||||||
docs.dir=reuters-out
|
|
||||||
|
|
||||||
content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
|
|
||||||
|
|
||||||
query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
|
|
||||||
|
|
||||||
# task at this depth or less would print when they start
|
|
||||||
task.max.depth.log=2
|
|
||||||
|
|
||||||
log.queries=true
|
|
||||||
# -------------------------------------------------------------------------------------
|
|
||||||
{ "Populate"
|
|
||||||
CreateIndex
|
|
||||||
{ "MAddDocs" AddDoc } : 20000
|
|
||||||
ForceMerge(1)
|
|
||||||
CloseIndex
|
|
||||||
}
|
|
||||||
{ "Rounds"
|
|
||||||
|
|
||||||
ResetSystemSoft
|
|
||||||
OpenReader
|
|
||||||
{ "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
|
|
||||||
CloseReader
|
|
||||||
OpenReader
|
|
||||||
{ "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
|
|
||||||
|
|
||||||
CloseReader
|
|
||||||
|
|
||||||
RepSumByPref SearchHlgtSameRdr
|
|
||||||
|
|
||||||
NewRound
|
|
||||||
|
|
||||||
} : 2
|
|
||||||
|
|
||||||
RepSumByNameRound
|
|
||||||
RepSumByName
|
|
||||||
RepSumByPrefRound MAddDocs
|
|
|
@ -349,6 +349,8 @@ public class PerfRunData implements Closeable {
|
||||||
// Hold reference to new IR
|
// Hold reference to new IR
|
||||||
indexReader.incRef();
|
indexReader.incRef();
|
||||||
indexSearcher = new IndexSearcher(indexReader);
|
indexSearcher = new IndexSearcher(indexReader);
|
||||||
|
// TODO Some day we should make the query cache in this module configurable and control clearing the cache
|
||||||
|
indexSearcher.setQueryCache(null);
|
||||||
} else {
|
} else {
|
||||||
indexSearcher = null;
|
indexSearcher = null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,6 +43,7 @@ import org.apache.lucene.document.FieldType;
|
||||||
import org.apache.lucene.document.LongPoint;
|
import org.apache.lucene.document.LongPoint;
|
||||||
import org.apache.lucene.document.StringField;
|
import org.apache.lucene.document.StringField;
|
||||||
import org.apache.lucene.document.TextField;
|
import org.apache.lucene.document.TextField;
|
||||||
|
import org.apache.lucene.index.IndexOptions;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates {@link Document} objects. Uses a {@link ContentSource} to generate
|
* Creates {@link Document} objects. Uses a {@link ContentSource} to generate
|
||||||
|
@ -58,6 +59,8 @@ import org.apache.lucene.document.TextField;
|
||||||
* (default <b>true</b>).
|
* (default <b>true</b>).
|
||||||
* <li><b>doc.body.tokenized</b> - specifies whether the
|
* <li><b>doc.body.tokenized</b> - specifies whether the
|
||||||
* body field should be tokenized (default = <b>doc.tokenized</b>).
|
* body field should be tokenized (default = <b>doc.tokenized</b>).
|
||||||
|
* <li><b>doc.body.offsets</b> - specifies whether to add offsets into the postings index
|
||||||
|
* for the body field. It is useful for highlighting. (default <b>false</b>)
|
||||||
* <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
|
* <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
|
||||||
* the index or not. (default <b>false</b>).
|
* the index or not. (default <b>false</b>).
|
||||||
* <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be
|
* <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be
|
||||||
|
@ -424,6 +427,7 @@ public class DocMaker implements Closeable {
|
||||||
boolean bodyTokenized = config.get("doc.body.tokenized", tokenized);
|
boolean bodyTokenized = config.get("doc.body.tokenized", tokenized);
|
||||||
boolean norms = config.get("doc.tokenized.norms", false);
|
boolean norms = config.get("doc.tokenized.norms", false);
|
||||||
boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
|
boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
|
||||||
|
boolean bodyOffsets = config.get("doc.body.offsets", false);
|
||||||
boolean termVec = config.get("doc.term.vector", false);
|
boolean termVec = config.get("doc.term.vector", false);
|
||||||
boolean termVecPositions = config.get("doc.term.vector.positions", false);
|
boolean termVecPositions = config.get("doc.term.vector.positions", false);
|
||||||
boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
|
boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
|
||||||
|
@ -441,6 +445,9 @@ public class DocMaker implements Closeable {
|
||||||
bodyValType.setStored(bodyStored);
|
bodyValType.setStored(bodyStored);
|
||||||
bodyValType.setTokenized(bodyTokenized);
|
bodyValType.setTokenized(bodyTokenized);
|
||||||
bodyValType.setOmitNorms(!bodyNorms);
|
bodyValType.setOmitNorms(!bodyNorms);
|
||||||
|
if (bodyTokenized && bodyOffsets) {
|
||||||
|
bodyValType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
|
||||||
|
}
|
||||||
bodyValType.setStoreTermVectors(termVec);
|
bodyValType.setStoreTermVectors(termVec);
|
||||||
bodyValType.setStoreTermVectorPositions(termVecPositions);
|
bodyValType.setStoreTermVectorPositions(termVecPositions);
|
||||||
bodyValType.setStoreTermVectorOffsets(termVecOffsets);
|
bodyValType.setStoreTermVectorOffsets(termVecOffsets);
|
||||||
|
|
|
@ -1,30 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.benchmark.byTask.tasks;
|
|
||||||
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Abstract class for benchmarking highlighting performance
|
|
||||||
*/
|
|
||||||
public abstract class BenchmarkHighlighter {
|
|
||||||
public abstract int doHighlight( IndexReader reader, int doc, String field,
|
|
||||||
Document document, Analyzer analyzer, String text ) throws Exception ;
|
|
||||||
}
|
|
|
@ -75,7 +75,7 @@ public abstract class ReadTask extends PerfTask {
|
||||||
int res = 0;
|
int res = 0;
|
||||||
|
|
||||||
// open reader or use existing one
|
// open reader or use existing one
|
||||||
IndexSearcher searcher = getRunData().getIndexSearcher();
|
IndexSearcher searcher = getRunData().getIndexSearcher(); // (will incRef the reader)
|
||||||
|
|
||||||
IndexReader reader;
|
IndexReader reader;
|
||||||
|
|
||||||
|
@ -132,8 +132,9 @@ public abstract class ReadTask extends PerfTask {
|
||||||
//hits = collector.topDocs();
|
//hits = collector.topDocs();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (hits != null) {
|
||||||
final String printHitsField = getRunData().getConfig().get("print.hits.field", null);
|
final String printHitsField = getRunData().getConfig().get("print.hits.field", null);
|
||||||
if (hits != null && printHitsField != null && printHitsField.length() > 0) {
|
if (printHitsField != null && printHitsField.length() > 0) {
|
||||||
System.out.println("totalHits = " + hits.totalHits);
|
System.out.println("totalHits = " + hits.totalHits);
|
||||||
System.out.println("maxDoc() = " + reader.maxDoc());
|
System.out.println("maxDoc() = " + reader.maxDoc());
|
||||||
System.out.println("numDocs() = " + reader.numDocs());
|
System.out.println("numDocs() = " + reader.numDocs());
|
||||||
|
@ -144,34 +145,7 @@ public abstract class ReadTask extends PerfTask {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (withTraverse()) {
|
res += withTopDocs(searcher, q, hits);
|
||||||
final ScoreDoc[] scoreDocs = hits.scoreDocs;
|
|
||||||
int traversalSize = Math.min(scoreDocs.length, traversalSize());
|
|
||||||
|
|
||||||
if (traversalSize > 0) {
|
|
||||||
boolean retrieve = withRetrieve();
|
|
||||||
int numHighlight = Math.min(numToHighlight(), scoreDocs.length);
|
|
||||||
Analyzer analyzer = getRunData().getAnalyzer();
|
|
||||||
BenchmarkHighlighter highlighter = null;
|
|
||||||
if (numHighlight > 0) {
|
|
||||||
highlighter = getBenchmarkHighlighter(q);
|
|
||||||
}
|
|
||||||
for (int m = 0; m < traversalSize; m++) {
|
|
||||||
int id = scoreDocs[m].doc;
|
|
||||||
res++;
|
|
||||||
if (retrieve) {
|
|
||||||
Document document = retrieveDoc(reader, id);
|
|
||||||
res += document != null ? 1 : 0;
|
|
||||||
if (numHighlight > 0 && m < numHighlight) {
|
|
||||||
Collection<String> fieldsToHighlight = getFieldsToHighlight(document);
|
|
||||||
for (final String field : fieldsToHighlight) {
|
|
||||||
String text = document.get(field);
|
|
||||||
res += highlighter.doHighlight(reader, id, field, document, analyzer, text);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -185,6 +159,28 @@ public abstract class ReadTask extends PerfTask {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
|
||||||
|
IndexReader reader = searcher.getIndexReader();
|
||||||
|
int res = 0;
|
||||||
|
if (withTraverse()) {
|
||||||
|
final ScoreDoc[] scoreDocs = hits.scoreDocs;
|
||||||
|
int traversalSize = Math.min(scoreDocs.length, traversalSize());
|
||||||
|
|
||||||
|
if (traversalSize > 0) {
|
||||||
|
boolean retrieve = withRetrieve();
|
||||||
|
for (int m = 0; m < traversalSize; m++) {
|
||||||
|
int id = scoreDocs[m].doc;
|
||||||
|
res++;
|
||||||
|
if (retrieve) {
|
||||||
|
Document document = retrieveDoc(reader, id);
|
||||||
|
res += document != null ? 1 : 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
protected Collector createCollector() throws Exception {
|
protected Collector createCollector() throws Exception {
|
||||||
return TopScoreDocCollector.create(numHits());
|
return TopScoreDocCollector.create(numHits());
|
||||||
}
|
}
|
||||||
|
@ -267,39 +263,8 @@ public abstract class ReadTask extends PerfTask {
|
||||||
*/
|
*/
|
||||||
public abstract boolean withRetrieve();
|
public abstract boolean withRetrieve();
|
||||||
|
|
||||||
/**
|
|
||||||
* Set to the number of documents to highlight.
|
|
||||||
*
|
|
||||||
* @return The number of the results to highlight. O means no docs will be highlighted.
|
|
||||||
*/
|
|
||||||
public int numToHighlight() {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return an appropriate highlighter to be used with
|
|
||||||
* highlighting tasks
|
|
||||||
*/
|
|
||||||
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Sort getSort() {
|
protected Sort getSort() {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Define the fields to highlight. Base implementation returns all fields
|
|
||||||
* @param document The Document
|
|
||||||
* @return A Collection of Field names (Strings)
|
|
||||||
*/
|
|
||||||
protected Collection<String> getFieldsToHighlight(Document document) {
|
|
||||||
List<IndexableField> fields = document.getFields();
|
|
||||||
Set<String> result = new HashSet<>(fields.size());
|
|
||||||
for (final IndexableField f : fields) {
|
|
||||||
result.add(f.name());
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,65 +14,98 @@
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.benchmark.byTask.tasks;
|
package org.apache.lucene.benchmark.byTask.tasks;
|
||||||
|
|
||||||
|
import java.text.BreakIterator;
|
||||||
import java.util.Collection;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.index.Fields;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.IndexableField;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.ScoreDoc;
|
||||||
|
import org.apache.lucene.search.TopDocs;
|
||||||
|
import org.apache.lucene.search.highlight.DefaultEncoder;
|
||||||
|
import org.apache.lucene.search.highlight.Encoder;
|
||||||
import org.apache.lucene.search.highlight.Highlighter;
|
import org.apache.lucene.search.highlight.Highlighter;
|
||||||
import org.apache.lucene.search.highlight.QueryScorer;
|
import org.apache.lucene.search.highlight.QueryScorer;
|
||||||
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
||||||
import org.apache.lucene.search.highlight.TextFragment;
|
|
||||||
import org.apache.lucene.search.highlight.TokenSources;
|
import org.apache.lucene.search.highlight.TokenSources;
|
||||||
|
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
|
||||||
|
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
|
||||||
|
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
|
||||||
|
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
|
||||||
|
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
|
||||||
|
import org.apache.lucene.search.vectorhighlight.FieldQuery;
|
||||||
|
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
|
||||||
|
import org.apache.lucene.search.vectorhighlight.WeightedFragListBuilder;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
|
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
|
||||||
*
|
*
|
||||||
* Uses the {@link org.apache.lucene.search.highlight.SimpleHTMLFormatter} for formatting.
|
|
||||||
*
|
|
||||||
* <p>Note: This task reuses the reader if it is already open.
|
* <p>Note: This task reuses the reader if it is already open.
|
||||||
* Otherwise a reader is opened at start and closed at the end.
|
* Otherwise a reader is opened at start and closed at the end.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* <p>Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]</p>
|
* <p>Takes optional multivalued, comma separated param string as: type[<enum>],maxFrags[<int>],fields[name1;name2;...]</p>
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
|
* <li>type - the highlighter implementation, e.g. "UH"</li>
|
||||||
* <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
|
|
||||||
* <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
|
* <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
|
||||||
* <li>mergeContiguous - true if contiguous fragments should be merged.</li>
|
|
||||||
* <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
|
* <li>fields - The fields to highlight, separated by semicolons. If not specified, only the "body" field is highlighted.</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
* Example:
|
* Example:
|
||||||
* <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
|
* <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(type[UH],maxFrags[3],fields[body]) > : 1000
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
*
|
||||||
* Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well.
|
* Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well,
|
||||||
|
* and offsets in postings are another option.
|
||||||
*
|
*
|
||||||
* <p>Other side effects: counts additional 1 (record) for each traversed hit,
|
* <p>Other side effects: counts additional 1 (record) for each traversed hit,
|
||||||
* and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
|
* and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
|
||||||
*/
|
*/
|
||||||
public class SearchTravRetHighlightTask extends SearchTravTask {
|
public class SearchTravRetHighlightTask extends SearchTravTask {
|
||||||
|
private int maxDocCharsToAnalyze; // max leading content chars to highlight
|
||||||
protected int numToHighlight = Integer.MAX_VALUE;
|
private int maxFrags = 1; // aka passages
|
||||||
protected boolean mergeContiguous;
|
private Set<String> hlFields = Collections.singleton("body");
|
||||||
protected int maxFrags = 2;
|
private String type;
|
||||||
protected Set<String> paramFields = Collections.emptySet();
|
private HLImpl hlImpl;
|
||||||
protected Highlighter highlighter;
|
private Analyzer analyzer;
|
||||||
protected int maxDocCharsToAnalyze;
|
|
||||||
|
|
||||||
public SearchTravRetHighlightTask(PerfRunData runData) {
|
public SearchTravRetHighlightTask(PerfRunData runData) {
|
||||||
super(runData);
|
super(runData);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setParams(String params) {
|
||||||
|
// can't call super because super doesn't understand our params syntax
|
||||||
|
this.params = params;
|
||||||
|
// TODO consider instead using data.getConfig().get("highlighter.*")?
|
||||||
|
String[] splits = params.split(",");
|
||||||
|
for (String split : splits) {
|
||||||
|
if (split.startsWith("type[") == true) {
|
||||||
|
type = split.substring("type[".length(), split.length() - 1);
|
||||||
|
} else if (split.startsWith("maxFrags[") == true) {
|
||||||
|
maxFrags = (int) Float.parseFloat(split.substring("maxFrags[".length(), split.length() - 1));
|
||||||
|
} else if (split.startsWith("fields[") == true) {
|
||||||
|
String fieldNames = split.substring("fields[".length(), split.length() - 1);
|
||||||
|
String[] fieldSplits = fieldNames.split(";");
|
||||||
|
hlFields = new HashSet<>(Arrays.asList(fieldSplits));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setup() throws Exception {
|
public void setup() throws Exception {
|
||||||
super.setup();
|
super.setup();
|
||||||
|
@ -82,72 +115,188 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
|
||||||
throw new Exception("doc.stored must be set to true");
|
throw new Exception("doc.stored must be set to true");
|
||||||
}
|
}
|
||||||
maxDocCharsToAnalyze = data.getConfig().get("highlighter.maxDocCharsToAnalyze", Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
|
maxDocCharsToAnalyze = data.getConfig().get("highlighter.maxDocCharsToAnalyze", Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
|
||||||
|
analyzer = data.getAnalyzer();
|
||||||
|
String type = this.type;
|
||||||
|
if (type == null) {
|
||||||
|
type = data.getConfig().get("highlighter", null);
|
||||||
|
}
|
||||||
|
switch (type) {
|
||||||
|
case "NONE": hlImpl = new NoHLImpl(); break;
|
||||||
|
case "SH_A": hlImpl = new StandardHLImpl(false); break;
|
||||||
|
case "SH_V": hlImpl = new StandardHLImpl(true); break;
|
||||||
|
|
||||||
|
case "FVH_V": hlImpl = new FastVectorHLImpl(); break;
|
||||||
|
|
||||||
|
case "UH": hlImpl = new UnifiedHLImpl(null); break;
|
||||||
|
case "UH_A": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.ANALYSIS); break;
|
||||||
|
case "UH_V": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.TERM_VECTORS); break;
|
||||||
|
case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break;
|
||||||
|
case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break;
|
||||||
|
|
||||||
|
case "PH_P": hlImpl = new PostingsHLImpl(); break;
|
||||||
|
|
||||||
|
default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// here is where we intercept ReadTask's logic to do the highlighting, and nothing else (no retrieval of all field vals)
|
||||||
@Override
|
@Override
|
||||||
public boolean withRetrieve() {
|
protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
|
||||||
return true;
|
hlImpl.withTopDocs(searcher, q, hits);
|
||||||
|
// note: it'd be nice if we knew the sum kilobytes of text across these hits so we could return that. It'd be a more
|
||||||
|
// useful number to gauge the amount of work. But given "average" document sizes and lots of queries, returning the
|
||||||
|
// number of docs is reasonable.
|
||||||
|
return hits.scoreDocs.length; // always return # scored docs.
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
private interface HLImpl {
|
||||||
public int numToHighlight() {
|
void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception;
|
||||||
return numToHighlight;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
private volatile int preventOptimizeAway = 0;
|
||||||
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
|
|
||||||
highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
|
private class StandardHLImpl implements HLImpl {
|
||||||
|
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>");
|
||||||
|
DefaultEncoder encoder = new DefaultEncoder();
|
||||||
|
Highlighter highlighter = new Highlighter(formatter, encoder, null);
|
||||||
|
boolean termVecs;
|
||||||
|
|
||||||
|
StandardHLImpl(boolean termVecs) {
|
||||||
|
highlighter.setEncoder(new DefaultEncoder());
|
||||||
highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
|
highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
|
||||||
return new BenchmarkHighlighter(){
|
this.termVecs = termVecs;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int doHighlight(IndexReader reader, int doc, String field,
|
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
|
||||||
Document document, Analyzer analyzer, String text) throws Exception {
|
IndexReader reader = searcher.getIndexReader();
|
||||||
final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
|
highlighter.setFragmentScorer(new QueryScorer(q));
|
||||||
TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset);
|
// highlighter.setTextFragmenter(); unfortunately no sentence mechanism, not even regex. Default here is trivial
|
||||||
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
|
for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
|
||||||
return frag != null ? frag.length : 0;
|
Document document = reader.document(scoreDoc.doc, hlFields);
|
||||||
|
Fields tvFields = termVecs ? reader.getTermVectors(scoreDoc.doc) : null;
|
||||||
|
for (IndexableField indexableField : document) {
|
||||||
|
TokenStream tokenStream;
|
||||||
|
if (termVecs) {
|
||||||
|
tokenStream = TokenSources.getTokenStream(indexableField.name(), tvFields,
|
||||||
|
indexableField.stringValue(), analyzer, maxDocCharsToAnalyze);
|
||||||
|
} else {
|
||||||
|
tokenStream = analyzer.tokenStream(indexableField.name(), indexableField.stringValue());
|
||||||
|
}
|
||||||
|
// will close TokenStream:
|
||||||
|
String[] fragments = highlighter.getBestFragments(tokenStream, indexableField.stringValue(), maxFrags);
|
||||||
|
preventOptimizeAway = fragments.length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class FastVectorHLImpl implements HLImpl {
|
||||||
|
int fragSize = 100;
|
||||||
|
WeightedFragListBuilder fragListBuilder = new WeightedFragListBuilder();
|
||||||
|
BoundaryScanner bs = new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ENGLISH));
|
||||||
|
ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(bs);
|
||||||
|
String[] preTags = {"<em>"};
|
||||||
|
String[] postTags = {"</em>"};
|
||||||
|
Encoder encoder = new DefaultEncoder();// new SimpleHTMLEncoder();
|
||||||
|
FastVectorHighlighter highlighter = new FastVectorHighlighter(
|
||||||
|
true, // phraseHighlight
|
||||||
|
false); // requireFieldMatch -- not pertinent to our benchmark
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
|
||||||
|
IndexReader reader = searcher.getIndexReader();
|
||||||
|
final FieldQuery fq = highlighter.getFieldQuery( q, reader);
|
||||||
|
for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
|
||||||
|
for (String hlField : hlFields) {
|
||||||
|
String[] fragments = highlighter.getBestFragments(fq, reader, scoreDoc.doc, hlField, fragSize, maxFrags,
|
||||||
|
fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
|
||||||
|
preventOptimizeAway = fragments.length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private ScoreDoc[] docIdOrder(ScoreDoc[] scoreDocs) {
|
||||||
|
ScoreDoc[] clone = new ScoreDoc[scoreDocs.length];
|
||||||
|
System.arraycopy(scoreDocs, 0, clone, 0, scoreDocs.length);
|
||||||
|
ArrayUtil.introSort(clone, (a, b) -> Integer.compare(a.doc, b.doc));
|
||||||
|
return clone;
|
||||||
|
}
|
||||||
|
|
||||||
|
private class PostingsHLImpl implements HLImpl {
|
||||||
|
PostingsHighlighter highlighter;
|
||||||
|
String[] fields = hlFields.toArray(new String[hlFields.size()]);
|
||||||
|
int[] maxPassages;
|
||||||
|
PostingsHLImpl() {
|
||||||
|
highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) {
|
||||||
|
@Override
|
||||||
|
protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards
|
||||||
|
return analyzer;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected BreakIterator getBreakIterator(String field) {
|
||||||
|
return BreakIterator.getSentenceInstance(Locale.ENGLISH);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
maxPassages = new int[hlFields.size()];
|
||||||
|
Arrays.fill(maxPassages, maxFrags);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> getFieldsToHighlight(Document document) {
|
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
|
||||||
Collection<String> result = super.getFieldsToHighlight(document);
|
Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages);
|
||||||
//if stored is false, then result will be empty, in which case just get all the param fields
|
preventOptimizeAway = result.size();
|
||||||
if (paramFields.isEmpty() == false && result.isEmpty() == false) {
|
|
||||||
result.retainAll(paramFields);
|
|
||||||
} else {
|
|
||||||
result = paramFields;
|
|
||||||
}
|
}
|
||||||
return result;
|
}
|
||||||
|
|
||||||
|
private class UnifiedHLImpl implements HLImpl {
|
||||||
|
UnifiedHighlighter highlighter;
|
||||||
|
IndexSearcher lastSearcher;
|
||||||
|
UnifiedHighlighter.OffsetSource offsetSource; // null means auto select
|
||||||
|
String[] fields = hlFields.toArray(new String[hlFields.size()]);
|
||||||
|
int[] maxPassages;
|
||||||
|
|
||||||
|
UnifiedHLImpl(final UnifiedHighlighter.OffsetSource offsetSource) {
|
||||||
|
this.offsetSource = offsetSource;
|
||||||
|
maxPassages = new int[hlFields.size()];
|
||||||
|
Arrays.fill(maxPassages, maxFrags);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void reset(IndexSearcher searcher) {
|
||||||
|
if (lastSearcher == searcher) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
lastSearcher = searcher;
|
||||||
|
highlighter = new UnifiedHighlighter(searcher, analyzer) {
|
||||||
|
@Override
|
||||||
|
protected OffsetSource getOffsetSource(String field) {
|
||||||
|
return offsetSource != null ? offsetSource : super.getOffsetSource(field);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
highlighter.setBreakIterator(() -> BreakIterator.getSentenceInstance(Locale.ENGLISH));
|
||||||
|
highlighter.setMaxLength(maxDocCharsToAnalyze);
|
||||||
|
highlighter.setHighlightPhrasesStrictly(true);
|
||||||
|
highlighter.setHandleMultiTermQuery(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setParams(String params) {
|
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
|
||||||
// can't call super because super doesn't understand our
|
reset(searcher);
|
||||||
// params syntax
|
Map<String, String[]> result = highlighter.highlightFields(fields, q, hits, maxPassages);
|
||||||
this.params = params;
|
preventOptimizeAway = result.size();
|
||||||
String [] splits = params.split(",");
|
|
||||||
for (int i = 0; i < splits.length; i++) {
|
|
||||||
if (splits[i].startsWith("size[") == true){
|
|
||||||
traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
|
|
||||||
} else if (splits[i].startsWith("highlight[") == true){
|
|
||||||
numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
|
|
||||||
} else if (splits[i].startsWith("maxFrags[") == true){
|
|
||||||
maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
|
|
||||||
} else if (splits[i].startsWith("mergeContiguous[") == true){
|
|
||||||
mergeContiguous = Boolean.valueOf(splits[i].substring("mergeContiguous[".length(),splits[i].length() - 1)).booleanValue();
|
|
||||||
} else if (splits[i].startsWith("fields[") == true){
|
|
||||||
paramFields = new HashSet<>();
|
|
||||||
String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
|
|
||||||
String [] fieldSplits = fieldNames.split(";");
|
|
||||||
for (int j = 0; j < fieldSplits.length; j++) {
|
|
||||||
paramFields.add(fieldSplits[j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private class NoHLImpl implements HLImpl {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
|
||||||
|
//just retrieve the HL fields
|
||||||
|
for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
|
||||||
|
preventOptimizeAway += searcher.doc(scoreDoc.doc, hlFields).iterator().hasNext() ? 2 : 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,147 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.benchmark.byTask.tasks;
|
|
||||||
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
|
||||||
import org.apache.lucene.search.Query;
|
|
||||||
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
|
|
||||||
import org.apache.lucene.search.vectorhighlight.FieldQuery;
|
|
||||||
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Collections;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents by using FastVectorHighlighter.
|
|
||||||
*
|
|
||||||
* <p>Note: This task reuses the reader if it is already open.
|
|
||||||
* Otherwise a reader is opened at start and closed at the end.
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* <p>Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]</p>
|
|
||||||
* <ul>
|
|
||||||
* <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
|
|
||||||
* <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
|
|
||||||
* <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
|
|
||||||
* <li>fragSize - The length of fragments</li>
|
|
||||||
* <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
|
|
||||||
* </ul>
|
|
||||||
* Example:
|
|
||||||
* <pre>"SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(size[10],highlight[10],maxFrags[3],fields[body]) > : 1000
|
|
||||||
* </pre>
|
|
||||||
*
|
|
||||||
* Fields must be stored and term vector offsets and positions in order must be true for this task to work.
|
|
||||||
*
|
|
||||||
* <p>Other side effects: counts additional 1 (record) for each traversed hit,
|
|
||||||
* and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
|
|
||||||
*/
|
|
||||||
public class SearchTravRetVectorHighlightTask extends SearchTravTask {
|
|
||||||
|
|
||||||
protected int numToHighlight = Integer.MAX_VALUE;
|
|
||||||
protected int maxFrags = 2;
|
|
||||||
protected int fragSize = 100;
|
|
||||||
protected Set<String> paramFields = Collections.emptySet();
|
|
||||||
protected FastVectorHighlighter highlighter;
|
|
||||||
|
|
||||||
public SearchTravRetVectorHighlightTask(PerfRunData runData) {
|
|
||||||
super(runData);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setup() throws Exception {
|
|
||||||
super.setup();
|
|
||||||
//check to make sure either the doc is being stored
|
|
||||||
PerfRunData data = getRunData();
|
|
||||||
if (data.getConfig().get("doc.stored", false) == false){
|
|
||||||
throw new Exception("doc.stored must be set to true");
|
|
||||||
}
|
|
||||||
if (data.getConfig().get("doc.term.vector.offsets", false) == false){
|
|
||||||
throw new Exception("doc.term.vector.offsets must be set to true");
|
|
||||||
}
|
|
||||||
if (data.getConfig().get("doc.term.vector.positions", false) == false){
|
|
||||||
throw new Exception("doc.term.vector.positions must be set to true");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean withRetrieve() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int numToHighlight() {
|
|
||||||
return numToHighlight;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
|
|
||||||
highlighter = new FastVectorHighlighter( false, false );
|
|
||||||
final Query myq = q;
|
|
||||||
return new BenchmarkHighlighter(){
|
|
||||||
@Override
|
|
||||||
public int doHighlight(IndexReader reader, int doc, String field,
|
|
||||||
Document document, Analyzer analyzer, String text) throws Exception {
|
|
||||||
final FieldQuery fq = highlighter.getFieldQuery( myq, reader);
|
|
||||||
String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags);
|
|
||||||
return fragments != null ? fragments.length : 0;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected Collection<String> getFieldsToHighlight(Document document) {
|
|
||||||
Collection<String> result = super.getFieldsToHighlight(document);
|
|
||||||
//if stored is false, then result will be empty, in which case just get all the param fields
|
|
||||||
if (paramFields.isEmpty() == false && result.isEmpty() == false) {
|
|
||||||
result.retainAll(paramFields);
|
|
||||||
} else {
|
|
||||||
result = paramFields;
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setParams(String params) {
|
|
||||||
// can't call super because super doesn't understand our
|
|
||||||
// params syntax
|
|
||||||
final String [] splits = params.split(",");
|
|
||||||
for (int i = 0; i < splits.length; i++) {
|
|
||||||
if (splits[i].startsWith("size[") == true){
|
|
||||||
traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
|
|
||||||
} else if (splits[i].startsWith("highlight[") == true){
|
|
||||||
numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
|
|
||||||
} else if (splits[i].startsWith("maxFrags[") == true){
|
|
||||||
maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
|
|
||||||
} else if (splits[i].startsWith("fragSize[") == true){
|
|
||||||
fragSize = (int)Float.parseFloat(splits[i].substring("fragSize[".length(),splits[i].length() - 1));
|
|
||||||
} else if (splits[i].startsWith("fields[") == true){
|
|
||||||
paramFields = new HashSet<>();
|
|
||||||
String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
|
|
||||||
String [] fieldSplits = fieldNames.split(";");
|
|
||||||
for (int j = 0; j < fieldSplits.length; j++) {
|
|
||||||
paramFields.add(fieldSplits[j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -31,9 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
||||||
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
import org.apache.lucene.benchmark.BenchmarkTestCase;
|
||||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||||
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
|
|
||||||
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
|
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
|
||||||
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
|
|
||||||
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
|
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
|
||||||
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
|
||||||
import org.apache.lucene.collation.CollationKeyAnalyzer;
|
import org.apache.lucene.collation.CollationKeyAnalyzer;
|
||||||
|
@ -159,110 +157,6 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
|
||||||
//assertTrue(CountingSearchTestTask.numSearches > 0);
|
//assertTrue(CountingSearchTestTask.numSearches > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testHighlighting() throws Exception {
|
|
||||||
// 1. alg definition (required in every "logic" test)
|
|
||||||
String algLines[] = {
|
|
||||||
"doc.stored=true",
|
|
||||||
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
|
|
||||||
"docs.file=" + getReuters20LinesFile(),
|
|
||||||
"query.maker=" + ReutersQueryMaker.class.getName(),
|
|
||||||
"ResetSystemErase",
|
|
||||||
"CreateIndex",
|
|
||||||
"{ AddDoc } : 100",
|
|
||||||
"ForceMerge(1)",
|
|
||||||
"CloseIndex",
|
|
||||||
"OpenReader",
|
|
||||||
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
|
|
||||||
"CloseReader",
|
|
||||||
};
|
|
||||||
|
|
||||||
// 2. we test this value later
|
|
||||||
CountingHighlighterTestTask.numHighlightedResults = 0;
|
|
||||||
CountingHighlighterTestTask.numDocsRetrieved = 0;
|
|
||||||
// 3. execute the algorithm (required in every "logic" test)
|
|
||||||
Benchmark benchmark = execBenchmark(algLines);
|
|
||||||
|
|
||||||
// 4. test specific checks after the benchmark run completed.
|
|
||||||
assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
|
|
||||||
//pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
|
|
||||||
//we probably should use a different doc/query maker, but...
|
|
||||||
assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
|
|
||||||
|
|
||||||
assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
|
|
||||||
// now we should be able to open the index for write.
|
|
||||||
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
|
|
||||||
iw.close();
|
|
||||||
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
|
|
||||||
assertEquals("100 docs were added to the index, this is what we expect to find!",100,ir.numDocs());
|
|
||||||
ir.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testHighlightingTV() throws Exception {
|
|
||||||
// 1. alg definition (required in every "logic" test)
|
|
||||||
String algLines[] = {
|
|
||||||
"doc.stored=true",//doc storage is required in order to have text to highlight
|
|
||||||
"doc.term.vector=true",
|
|
||||||
"doc.term.vector.offsets=true",
|
|
||||||
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
|
|
||||||
"docs.file=" + getReuters20LinesFile(),
|
|
||||||
"query.maker=" + ReutersQueryMaker.class.getName(),
|
|
||||||
"ResetSystemErase",
|
|
||||||
"CreateIndex",
|
|
||||||
"{ AddDoc } : 1000",
|
|
||||||
"ForceMerge(1)",
|
|
||||||
"CloseIndex",
|
|
||||||
"OpenReader",
|
|
||||||
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
|
|
||||||
"CloseReader",
|
|
||||||
};
|
|
||||||
|
|
||||||
// 2. we test this value later
|
|
||||||
CountingHighlighterTestTask.numHighlightedResults = 0;
|
|
||||||
CountingHighlighterTestTask.numDocsRetrieved = 0;
|
|
||||||
// 3. execute the algorithm (required in every "logic" test)
|
|
||||||
Benchmark benchmark = execBenchmark(algLines);
|
|
||||||
|
|
||||||
// 4. test specific checks after the benchmark run completed.
|
|
||||||
assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
|
|
||||||
//pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
|
|
||||||
//we probably should use a different doc/query maker, but...
|
|
||||||
assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
|
|
||||||
|
|
||||||
assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
|
|
||||||
// now we should be able to open the index for write.
|
|
||||||
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
|
|
||||||
iw.close();
|
|
||||||
IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
|
|
||||||
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
|
|
||||||
ir.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testHighlightingNoTvNoStore() throws Exception {
|
|
||||||
// 1. alg definition (required in every "logic" test)
|
|
||||||
String algLines[] = {
|
|
||||||
"doc.stored=false",
|
|
||||||
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
|
|
||||||
"docs.file=" + getReuters20LinesFile(),
|
|
||||||
"query.maker=" + ReutersQueryMaker.class.getName(),
|
|
||||||
"ResetSystemErase",
|
|
||||||
"CreateIndex",
|
|
||||||
"{ AddDoc } : 1000",
|
|
||||||
"ForceMerge(1)",
|
|
||||||
"CloseIndex",
|
|
||||||
"OpenReader",
|
|
||||||
"{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
|
|
||||||
"CloseReader",
|
|
||||||
};
|
|
||||||
|
|
||||||
// 2. we test this value later
|
|
||||||
CountingHighlighterTestTask.numHighlightedResults = 0;
|
|
||||||
CountingHighlighterTestTask.numDocsRetrieved = 0;
|
|
||||||
// 3. execute the algorithm (required in every "logic" test)
|
|
||||||
expectThrows(Exception.class, () -> {
|
|
||||||
execBenchmark(algLines);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test Exhausting Doc Maker logic
|
* Test Exhausting Doc Maker logic
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -1,68 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.benchmark.byTask.tasks;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
|
||||||
import org.apache.lucene.search.Query;
|
|
||||||
import org.apache.lucene.search.highlight.Highlighter;
|
|
||||||
import org.apache.lucene.search.highlight.QueryScorer;
|
|
||||||
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
|
||||||
import org.apache.lucene.search.highlight.TextFragment;
|
|
||||||
import org.apache.lucene.search.highlight.TokenSources;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test Search task which counts number of searches.
|
|
||||||
*/
|
|
||||||
public class CountingHighlighterTestTask extends SearchTravRetHighlightTask {
|
|
||||||
|
|
||||||
public static int numHighlightedResults = 0;
|
|
||||||
public static int numDocsRetrieved = 0;
|
|
||||||
|
|
||||||
public CountingHighlighterTestTask(PerfRunData runData) {
|
|
||||||
super(runData);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
|
|
||||||
Document document = ir.document(id);
|
|
||||||
if (document != null) {
|
|
||||||
numDocsRetrieved++;
|
|
||||||
}
|
|
||||||
return document;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public BenchmarkHighlighter getBenchmarkHighlighter(Query q) {
|
|
||||||
highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
|
|
||||||
return new BenchmarkHighlighter() {
|
|
||||||
@Override
|
|
||||||
public int doHighlight(IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text) throws Exception {
|
|
||||||
final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
|
|
||||||
TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset);
|
|
||||||
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
|
|
||||||
numHighlightedResults += frag != null ? frag.length : 0;
|
|
||||||
return frag != null ? frag.length : 0;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue