diff --git a/build.xml b/build.xml index 247ba60d2b2..0d25615d3af 100644 --- a/build.xml +++ b/build.xml @@ -192,6 +192,8 @@ // excludes: exclude(name: '**/build/**') exclude(name: '**/dist/**') + exclude(name: 'lucene/benchmark/work/**') + exclude(name: 'lucene/benchmark/temp/**') exclude(name: '**/CheckLoggingConfiguration.java') exclude(name: 'build.xml') // ourselves :-) } diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0a65d204306..bdb4998496d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -76,6 +76,9 @@ Other * LUCENE-7452: Block join query exception suggests how to find a doc, which violates orthogonality requirement. (Mikhail Khludnev) +* LUCENE-7438: Renovate the Benchmark module's support for benchmarking highlighting. All + highlighters are supported via SearchTravRetHighlight. (David Smiley) + Build * LUCENE-7292: Fix build to use "--release 8" instead of "-release 8" on diff --git a/lucene/benchmark/.gitignore b/lucene/benchmark/.gitignore index 6cac9b7cfad..a20524a7293 100644 --- a/lucene/benchmark/.gitignore +++ b/lucene/benchmark/.gitignore @@ -1,2 +1,2 @@ -temp/ -work/ \ No newline at end of file +/temp +/work \ No newline at end of file diff --git a/lucene/benchmark/README.enwiki b/lucene/benchmark/README.enwiki index f9d49300e87..7ad07a8b62e 100644 --- a/lucene/benchmark/README.enwiki +++ b/lucene/benchmark/README.enwiki @@ -13,10 +13,13 @@ writing, there is a page file in http://download.wikimedia.org/enwiki/20070402/. You can download this file manually and put it in temp. Note that the file you download will probably have the date in the name, e.g., -http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. When -you put it in temp, rename it to enwiki-latest-pages-articles.xml.bz2. +http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. + +If you use the EnwikiContentSource then the data will be decompressed on the fly +during the benchmark. If you want to benchmark indexing, you should probably decompress +it beforehand using the "enwiki" Ant target, which will produce work/enwiki.txt, after +which you can use LineDocSource in your benchmark. After that, ant enwiki should process the data set and run a load -test. Ant targets get-enwiki, expand-enwiki, and extract-enwiki can -also be used to download, decompress, and extract (to individual files -in work/enwiki) the dataset, respectively. +test. The enwiki Ant target will download, decompress, and extract +(to individual files in work/enwiki) the dataset. diff --git a/lucene/benchmark/conf/highlight-vs-vector-highlight.alg b/lucene/benchmark/conf/highlight-vs-vector-highlight.alg deleted file mode 100644 index cc4382d8185..00000000000 --- a/lucene/benchmark/conf/highlight-vs-vector-highlight.alg +++ /dev/null @@ -1,80 +0,0 @@ -#/** -# * Licensed to the Apache Software Foundation (ASF) under one or more -# * contributor license agreements. See the NOTICE file distributed with -# * this work for additional information regarding copyright ownership. -# * The ASF licenses this file to You under the Apache License, Version 2.0 -# * (the "License"); you may not use this file except in compliance with -# * the License. You may obtain a copy of the License at -# * -# * http://www.apache.org/licenses/LICENSE-2.0 -# * -# * Unless required by applicable law or agreed to in writing, software -# * distributed under the License is distributed on an "AS IS" BASIS, -# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and -# * limitations under the License. -# */ -# ------------------------------------------------------------------------------------- - -ram.flush.mb=flush:32:32 -compound=cmpnd:true:false - -analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer -directory=FSDirectory - -doc.stored=true -doc.tokenized=true -doc.term.vector=true -doc.term.vector.offsets=true -doc.term.vector.positions=true -log.step=2000 - -docs.dir=reuters-out - -content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource -docs.file=temp/enwiki-20070527-pages-articles.xml - -query.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiQueryMaker -enwikiQueryMaker.disableSpanQueries=true - -max.field.length=2147483647 -highlighter.maxDocCharsToAnalyze=2147483647 - -# task at this depth or less would print when they start -task.max.depth.log=2 - -log.queries=true -# ------------------------------------------------------------------------------------- -{ "Populate" - CreateIndex - { "MAddDocs" AddDoc } : 20000 - ForceMerge(1) - CloseIndex -} -{ - OpenReader - { "WarmTV" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 100 - CloseReader -} -{ - "Rounds" - - ResetSystemSoft - - OpenReader - { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 200 - CloseReader - - ResetSystemSoft - - OpenReader - { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 200 - CloseReader - - RepSumByPref Search - - NewRound -} : 4 - -RepSumByNameRound -RepSumByName diff --git a/lucene/benchmark/conf/vector-highlight-profile.alg b/lucene/benchmark/conf/highlighters-postings.alg similarity index 51% rename from lucene/benchmark/conf/vector-highlight-profile.alg rename to lucene/benchmark/conf/highlighters-postings.alg index 4348783b50a..cf9df118786 100644 --- a/lucene/benchmark/conf/vector-highlight-profile.alg +++ b/lucene/benchmark/conf/highlighters-postings.alg @@ -14,55 +14,52 @@ # * See the License for the specific language governing permissions and # * limitations under the License. # */ -# ------------------------------------------------------------------------------------- -# multi val params are iterated by NewRound's, added to reports, start with column name. 
-ram.flush.mb=flush:32:32 -compound=cmpnd:true:false +# For postings-offsets with light term-vectors analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory +work.dir=work/enwikiPostings +ram.flush.mb=64 +compound=false doc.stored=true doc.tokenized=true +# offsets in postings: +doc.body.offsets=true +# term vector, but no positions/offsets with it doc.term.vector=true -doc.term.vector.offsets=true -doc.term.vector.positions=true -log.step=2000 -docs.dir=reuters-out +content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource +docs.file=temp/enwiki-20070527-pages-articles.xml.bz2 -content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource +query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker +file.query.maker.file=conf/query-phrases.txt +log.queries=false +log.step.SearchTravRetHighlight=-1 -query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker +highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV -# task at this depth or less would print when they start -task.max.depth.log=2 - -log.queries=true -# ------------------------------------------------------------------------------------- { "Populate" CreateIndex - { "MAddDocs" AddDoc } : 20000 - ForceMerge(1) + [{ "MAddDocs" AddDoc > : 50000] : 4 CloseIndex - } -{ "Rounds" + } : 0 - ResetSystemSoft +{ + "Rounds" + ResetSystemSoft - OpenReader - { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000 + OpenReader - CloseReader + { "Warm" SearchTravRetHighlight > : 1000 - RepSumByPref MAddDocs + { "HL" SearchTravRetHighlight > : 500 - NewRound + CloseReader -} : 4 + NewRound +} : 6 -RepSumByNameRound -RepSumByName -RepSumByPrefRound MAddDocs +RepSumByPrefRound HL \ No newline at end of file diff --git a/lucene/benchmark/conf/highlight-profile.alg b/lucene/benchmark/conf/highlighters-tv.alg similarity index 54% rename from lucene/benchmark/conf/highlight-profile.alg rename to lucene/benchmark/conf/highlighters-tv.alg index b62644cde31..1e51018e37d 100644 --- a/lucene/benchmark/conf/highlight-profile.alg +++ b/lucene/benchmark/conf/highlighters-tv.alg @@ -14,55 +14,51 @@ # * See the License for the specific language governing permissions and # * limitations under the License. # */ -# ------------------------------------------------------------------------------------- -# multi val params are iterated by NewRound's, added to reports, start with column name. -ram.flush.mb=flush:32:32 -compound=cmpnd:true:false +# This is a full-term vector configuration. 
analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory +work.dir=work/enwikiTermVec +ram.flush.mb=64 +compound=false doc.stored=true doc.tokenized=true doc.term.vector=true -doc.term.vector.offsets=true doc.term.vector.positions=true -log.step=2000 +doc.term.vector.offsets=true -docs.dir=reuters-out +content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource +docs.file=temp/enwiki-20070527-pages-articles.xml.bz2 -content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource +query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker +file.query.maker.file=conf/query-terms.txt +log.queries=false +log.step.SearchTravRetHighlight=-1 -query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker +highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V -# task at this depth or less would print when they start -task.max.depth.log=2 - -log.queries=true -# ------------------------------------------------------------------------------------- { "Populate" CreateIndex - { "MAddDocs" AddDoc } : 20000 - ForceMerge(1) + [{ "MAddDocs" AddDoc > : 50000] : 4 CloseIndex - } -{ "Rounds" + } : 0 - ResetSystemSoft +{ + "Rounds" + ResetSystemSoft - OpenReader - { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000 + OpenReader - CloseReader + { "Warm" SearchTravRetHighlight > : 1000 - RepSumByPref MAddDocs + { "HL" SearchTravRetHighlight > : 500 - NewRound + CloseReader + NewRound } : 4 -RepSumByNameRound -RepSumByName -RepSumByPrefRound MAddDocs +RepSumByPrefRound HL \ No newline at end of file diff --git a/lucene/benchmark/conf/standard-highlights-notv.alg b/lucene/benchmark/conf/highlights.alg similarity index 93% rename from lucene/benchmark/conf/standard-highlights-notv.alg rename to lucene/benchmark/conf/highlights.alg index 040e1ef1370..88b056ecee4 100644 --- a/lucene/benchmark/conf/standard-highlights-notv.alg +++ b/lucene/benchmark/conf/highlights.alg @@ -54,7 +54,7 @@ log.queries=true { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000 CloseReader OpenReader - { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000 + { "SearchHlgtSameRdr" SearchTravRetHighlight(type[UH]) > : 1000 CloseReader diff --git a/lucene/benchmark/conf/query-phrases.txt b/lucene/benchmark/conf/query-phrases.txt new file mode 100644 index 00000000000..b479663b2f2 --- /dev/null +++ b/lucene/benchmark/conf/query-phrases.txt @@ -0,0 +1,10 @@ +"Abraham Lincoln" +"Union Wisconsin" +"court of law" +"Field Theory" OR "Set Theory" +"Top 100" +"red hot chili" +"greatest guitarists" +"Planes, Trains & Automobiles" OR ships +"international airport" +"Xbox 360" \ No newline at end of file diff --git a/lucene/benchmark/conf/query-terms.txt b/lucene/benchmark/conf/query-terms.txt new file mode 100644 index 00000000000..c57bace06f3 --- /dev/null +++ b/lucene/benchmark/conf/query-terms.txt @@ -0,0 +1,10 @@ +Abraham AND Lincoln +Union AND Wisconsin +court AND law +top AND 100 +(field OR set) AND theory +red AND hot AND chili +greatest AND guitarists +(planes AND trains AND automobiles) OR ships +international AND airport +xbox AND 360 \ No newline at end of file diff --git a/lucene/benchmark/conf/query-wildcards.txt b/lucene/benchmark/conf/query-wildcards.txt new file mode 100644 index 00000000000..06685c63605 --- /dev/null +++ b/lucene/benchmark/conf/query-wildcards.txt @@ -0,0 +1,7 @@ +abrah* AND linc* +court* AND law* +(field OR set) AND theor* +red AND hot AND chili* +great* AND 
guitar* +(plan* AND train* AND automob*) OR ship* +international AND airport* \ No newline at end of file diff --git a/lucene/benchmark/conf/standard-highlights-tv.alg b/lucene/benchmark/conf/standard-highlights-tv.alg deleted file mode 100644 index 3cd18b8df84..00000000000 --- a/lucene/benchmark/conf/standard-highlights-tv.alg +++ /dev/null @@ -1,69 +0,0 @@ -#/** -# * Licensed to the Apache Software Foundation (ASF) under one or more -# * contributor license agreements. See the NOTICE file distributed with -# * this work for additional information regarding copyright ownership. -# * The ASF licenses this file to You under the Apache License, Version 2.0 -# * (the "License"); you may not use this file except in compliance with -# * the License. You may obtain a copy of the License at -# * -# * http://www.apache.org/licenses/LICENSE-2.0 -# * -# * Unless required by applicable law or agreed to in writing, software -# * distributed under the License is distributed on an "AS IS" BASIS, -# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# * See the License for the specific language governing permissions and -# * limitations under the License. -# */ -# ------------------------------------------------------------------------------------- -# multi val params are iterated by NewRound's, added to reports, start with column name. - -ram.flush.mb=flush:32:32 -compound=cmpnd:true:false - -analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer -directory=FSDirectory - -doc.stored=true -doc.tokenized=true -doc.term.vector=true -doc.term.vector.offsets=true -doc.term.vector.positions=true -log.step=2000 - -docs.dir=reuters-out - -content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource - -query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker - -# task at this depth or less would print when they start -task.max.depth.log=2 - -log.queries=true -# ------------------------------------------------------------------------------------- -{ "Populate" - CreateIndex - { "MAddDocs" AddDoc } : 20000 - ForceMerge(1) - CloseIndex -} -{ "Rounds" - - ResetSystemSoft - OpenReader - { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000 - CloseReader - OpenReader - { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000 - - CloseReader - - RepSumByPref SearchHlgtSameRdr - - NewRound - -} : 2 - -RepSumByNameRound -RepSumByName -RepSumByPrefRound MAddDocs diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java index 1d4b6433a82..a08b79e38cf 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java @@ -349,6 +349,8 @@ public class PerfRunData implements Closeable { // Hold reference to new IR indexReader.incRef(); indexSearcher = new IndexSearcher(indexReader); + // TODO Some day we should make the query cache in this module configurable and control clearing the cache + indexSearcher.setQueryCache(null); } else { indexSearcher = null; } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java index 4afafc321c7..2c722a792ce 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java +++ 
b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java @@ -43,6 +43,7 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexOptions; /** * Creates {@link Document} objects. Uses a {@link ContentSource} to generate @@ -58,6 +59,8 @@ import org.apache.lucene.document.TextField; * (default true). * diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
Note: This task reuses the reader if it is already open. * Otherwise a reader is opened at start and closed at the end. *
* - *Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]
+ *Takes optional multivalued, comma separated param string as: type[<enum>],maxFrags[<int>],fields[name1;name2;...]
*"SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000 + *"SearchHlgtSameRdr" SearchTravRetHighlight(type[UH],maxFrags[3],fields[body]) > : 1000 ** - * Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well. + * Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well, + * and offsets in postings is another option. * *Other side effects: counts additional 1 (record) for each traversed hit, * and 1 more for each retrieved (non null) document and 1 for each fragment returned.
*/ public class SearchTravRetHighlightTask extends SearchTravTask { - - protected int numToHighlight = Integer.MAX_VALUE; - protected boolean mergeContiguous; - protected int maxFrags = 2; - protected Set<String> paramFields = Collections.emptySet(); - protected Highlighter highlighter; - protected int maxDocCharsToAnalyze; + private int maxDocCharsToAnalyze; // max leading content chars to highlight + private int maxFrags = 1; // aka passages + private Set<String> hlFields = Collections.singleton("body"); + private String type; + private HLImpl hlImpl; + private Analyzer analyzer; public SearchTravRetHighlightTask(PerfRunData runData) { super(runData); } + @Override + public void setParams(String params) { + // can't call super because super doesn't understand our params syntax + this.params = params; + // TODO consider instead using data.getConfig().get("highlighter.*")? + String[] splits = params.split(","); + for (String split : splits) { + if (split.startsWith("type[") == true) { + type = split.substring("type[".length(), split.length() - 1); + } else if (split.startsWith("maxFrags[") == true) { + maxFrags = (int) Float.parseFloat(split.substring("maxFrags[".length(), split.length() - 1)); + } else if (split.startsWith("fields[") == true) { + String fieldNames = split.substring("fields[".length(), split.length() - 1); + String[] fieldSplits = fieldNames.split(";"); + hlFields = new HashSet<>(Arrays.asList(fieldSplits)); + } + } + } + @Override public void setup() throws Exception { super.setup(); @@ -82,72 +115,188 @@ public class SearchTravRetHighlightTask extends SearchTravTask { throw new Exception("doc.stored must be set to true"); } maxDocCharsToAnalyze = data.getConfig().get("highlighter.maxDocCharsToAnalyze", Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); - } - - @Override - public boolean withRetrieve() { - return true; - } - - @Override - public int numToHighlight() { - return numToHighlight; - } - - @Override - protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){ - highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q)); - highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze); - return new BenchmarkHighlighter(){ - @Override - public int doHighlight(IndexReader reader, int doc, String field, - Document document, Analyzer analyzer, String text) throws Exception { - final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1; - TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset); - TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags); - return frag != null ?
frag.length : 0; - } - }; - } - - @Override - protected Collection getFieldsToHighlight(Document document) { - Collection result = super.getFieldsToHighlight(document); - //if stored is false, then result will be empty, in which case just get all the param fields - if (paramFields.isEmpty() == false && result.isEmpty() == false) { - result.retainAll(paramFields); - } else { - result = paramFields; + analyzer = data.getAnalyzer(); + String type = this.type; + if (type == null) { + type = data.getConfig().get("highlighter", null); + } + switch (type) { + case "NONE": hlImpl = new NoHLImpl(); break; + case "SH_A": hlImpl = new StandardHLImpl(false); break; + case "SH_V": hlImpl = new StandardHLImpl(true); break; + + case "FVH_V": hlImpl = new FastVectorHLImpl(); break; + + case "UH": hlImpl = new UnifiedHLImpl(null); break; + case "UH_A": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.ANALYSIS); break; + case "UH_V": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.TERM_VECTORS); break; + case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break; + case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break; + + case "PH_P": hlImpl = new PostingsHLImpl(); break; + + default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')"); } - return result; } + // here is where we intercept ReadTask's logic to do the highlighting, and nothing else (no retrieval of all field vals) @Override - public void setParams(String params) { - // can't call super because super doesn't understand our - // params syntax - this.params = params; - String [] splits = params.split(","); - for (int i = 0; i < splits.length; i++) { - if (splits[i].startsWith("size[") == true){ - traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1)); - } else if (splits[i].startsWith("highlight[") == true){ - numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1)); - } else if (splits[i].startsWith("maxFrags[") == true){ - maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1)); - } else if (splits[i].startsWith("mergeContiguous[") == true){ - mergeContiguous = Boolean.valueOf(splits[i].substring("mergeContiguous[".length(),splits[i].length() - 1)).booleanValue(); - } else if (splits[i].startsWith("fields[") == true){ - paramFields = new HashSet<>(); - String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1); - String [] fieldSplits = fieldNames.split(";"); - for (int j = 0; j < fieldSplits.length; j++) { - paramFields.add(fieldSplits[j]); + protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { + hlImpl.withTopDocs(searcher, q, hits); + // note: it'd be nice if we knew the sum kilobytes of text across these hits so we could return that. It'd be a more + // useful number to gauge the amount of work. But given "average" document sizes and lots of queries, returning the + // number of docs is reasonable. + return hits.scoreDocs.length; // always return # scored docs. 
+ } + + private interface HLImpl { + void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception; + } + + private volatile int preventOptimizeAway = 0; + + private class StandardHLImpl implements HLImpl { + SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>"); + DefaultEncoder encoder = new DefaultEncoder(); + Highlighter highlighter = new Highlighter(formatter, encoder, null); + boolean termVecs; + + StandardHLImpl(boolean termVecs) { + highlighter.setEncoder(new DefaultEncoder()); + highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze); + this.termVecs = termVecs; + } + + @Override + public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { + IndexReader reader = searcher.getIndexReader(); + highlighter.setFragmentScorer(new QueryScorer(q)); + // highlighter.setTextFragmenter(); unfortunately no sentence mechanism, not even regex. Default here is trivial + for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) { + Document document = reader.document(scoreDoc.doc, hlFields); + Fields tvFields = termVecs ? reader.getTermVectors(scoreDoc.doc) : null; + for (IndexableField indexableField : document) { + TokenStream tokenStream; + if (termVecs) { + tokenStream = TokenSources.getTokenStream(indexableField.name(), tvFields, + indexableField.stringValue(), analyzer, maxDocCharsToAnalyze); + } else { + tokenStream = analyzer.tokenStream(indexableField.name(), indexableField.stringValue()); + } + // will close TokenStream: + String[] fragments = highlighter.getBestFragments(tokenStream, indexableField.stringValue(), maxFrags); + preventOptimizeAway = fragments.length; + } + } + } + } + + private class FastVectorHLImpl implements HLImpl { + int fragSize = 100; + WeightedFragListBuilder fragListBuilder = new WeightedFragListBuilder(); + BoundaryScanner bs = new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ENGLISH)); + ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(bs); + String[] preTags = {"<em>"}; + String[] postTags = {"</em>"}; + Encoder encoder = new DefaultEncoder();// new SimpleHTMLEncoder(); + FastVectorHighlighter highlighter = new FastVectorHighlighter( + true, // phraseHighlight + false); // requireFieldMatch -- not pertinent to our benchmark + + @Override + public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { + IndexReader reader = searcher.getIndexReader(); + final FieldQuery fq = highlighter.getFieldQuery( q, reader); + for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) { + for (String hlField : hlFields) { + String[] fragments = highlighter.getBestFragments(fq, reader, scoreDoc.doc, hlField, fragSize, maxFrags, + fragListBuilder, fragmentsBuilder, preTags, postTags, encoder); + preventOptimizeAway = fragments.length; + } + } + } + } + + private ScoreDoc[] docIdOrder(ScoreDoc[] scoreDocs) { + ScoreDoc[] clone = new ScoreDoc[scoreDocs.length]; + System.arraycopy(scoreDocs, 0, clone, 0, scoreDocs.length); + ArrayUtil.introSort(clone, (a, b) -> Integer.compare(a.doc, b.doc)); + return clone; + } + + private class PostingsHLImpl implements HLImpl { + PostingsHighlighter highlighter; + String[] fields = hlFields.toArray(new String[hlFields.size()]); + int[] maxPassages; + PostingsHLImpl() { + highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) { + @Override + protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards + return analyzer; } - } + @Override + protected BreakIterator getBreakIterator(String field) { +
return BreakIterator.getSentenceInstance(Locale.ENGLISH); + } + }; + maxPassages = new int[hlFields.size()]; + Arrays.fill(maxPassages, maxFrags); + } + + @Override + public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { + Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages); + preventOptimizeAway = result.size(); } } + private class UnifiedHLImpl implements HLImpl { + UnifiedHighlighter highlighter; + IndexSearcher lastSearcher; + UnifiedHighlighter.OffsetSource offsetSource; // null means auto select + String[] fields = hlFields.toArray(new String[hlFields.size()]); + int[] maxPassages; + UnifiedHLImpl(final UnifiedHighlighter.OffsetSource offsetSource) { + this.offsetSource = offsetSource; + maxPassages = new int[hlFields.size()]; + Arrays.fill(maxPassages, maxFrags); + } + + private void reset(IndexSearcher searcher) { + if (lastSearcher == searcher) { + return; + } + lastSearcher = searcher; + highlighter = new UnifiedHighlighter(searcher, analyzer) { + @Override + protected OffsetSource getOffsetSource(String field) { + return offsetSource != null ? offsetSource : super.getOffsetSource(field); + } + }; + highlighter.setBreakIterator(() -> BreakIterator.getSentenceInstance(Locale.ENGLISH)); + highlighter.setMaxLength(maxDocCharsToAnalyze); + highlighter.setHighlightPhrasesStrictly(true); + highlighter.setHandleMultiTermQuery(true); + } + + @Override + public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { + reset(searcher); + Map<String, String[]> result = highlighter.highlightFields(fields, q, hits, maxPassages); + preventOptimizeAway = result.size(); + } + } + + private class NoHLImpl implements HLImpl { + + @Override + public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception { + //just retrieve the HL fields + for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) { + preventOptimizeAway += searcher.doc(scoreDoc.doc, hlFields).iterator().hasNext() ? 2 : 1; + } + } + } } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java deleted file mode 100644 index 15a13ca3624..00000000000 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
- */ -package org.apache.lucene.benchmark.byTask.tasks; - - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.benchmark.byTask.PerfRunData; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; -import org.apache.lucene.search.vectorhighlight.FieldQuery; - -import java.util.Set; -import java.util.Collection; -import java.util.HashSet; -import java.util.Collections; - -/** - * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents by using FastVectorHighlighter. - * - * Note: This task reuses the reader if it is already open. - * Otherwise a reader is opened at start and closed at the end. - *
- * - *Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]
- *- *
- * Example: - *- traversal size - The number of hits to traverse, otherwise all will be traversed
- *- highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())
- *- maxFrags - The maximum number of fragments to score by the highlighter
- *- fragSize - The length of fragments
- *- fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)
- *"SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(size[10],highlight[10],maxFrags[3],fields[body]) > : 1000 - *- * - * Fields must be stored and term vector offsets and positions in order must be true for this task to work. - * - *Other side effects: counts additional 1 (record) for each traversed hit, - * and 1 more for each retrieved (non null) document and 1 for each fragment returned.
- */ -public class SearchTravRetVectorHighlightTask extends SearchTravTask { - - protected int numToHighlight = Integer.MAX_VALUE; - protected int maxFrags = 2; - protected int fragSize = 100; - protected SetparamFields = Collections.emptySet(); - protected FastVectorHighlighter highlighter; - - public SearchTravRetVectorHighlightTask(PerfRunData runData) { - super(runData); - } - - @Override - public void setup() throws Exception { - super.setup(); - //check to make sure either the doc is being stored - PerfRunData data = getRunData(); - if (data.getConfig().get("doc.stored", false) == false){ - throw new Exception("doc.stored must be set to true"); - } - if (data.getConfig().get("doc.term.vector.offsets", false) == false){ - throw new Exception("doc.term.vector.offsets must be set to true"); - } - if (data.getConfig().get("doc.term.vector.positions", false) == false){ - throw new Exception("doc.term.vector.positions must be set to true"); - } - } - - @Override - public boolean withRetrieve() { - return true; - } - - @Override - public int numToHighlight() { - return numToHighlight; - } - - @Override - protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){ - highlighter = new FastVectorHighlighter( false, false ); - final Query myq = q; - return new BenchmarkHighlighter(){ - @Override - public int doHighlight(IndexReader reader, int doc, String field, - Document document, Analyzer analyzer, String text) throws Exception { - final FieldQuery fq = highlighter.getFieldQuery( myq, reader); - String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags); - return fragments != null ? fragments.length : 0; - } - }; - } - - @Override - protected Collection getFieldsToHighlight(Document document) { - Collection result = super.getFieldsToHighlight(document); - //if stored is false, then result will be empty, in which case just get all the param fields - if (paramFields.isEmpty() == false && result.isEmpty() == false) { - result.retainAll(paramFields); - } else { - result = paramFields; - } - return result; - } - - @Override - public void setParams(String params) { - // can't call super because super doesn't understand our - // params syntax - final String [] splits = params.split(","); - for (int i = 0; i < splits.length; i++) { - if (splits[i].startsWith("size[") == true){ - traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1)); - } else if (splits[i].startsWith("highlight[") == true){ - numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1)); - } else if (splits[i].startsWith("maxFrags[") == true){ - maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1)); - } else if (splits[i].startsWith("fragSize[") == true){ - fragSize = (int)Float.parseFloat(splits[i].substring("fragSize[".length(),splits[i].length() - 1)); - } else if (splits[i].startsWith("fields[") == true){ - paramFields = new HashSet<>(); - String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1); - String [] fieldSplits = fieldNames.split(";"); - for (int j = 0; j < fieldSplits.length; j++) { - paramFields.add(fieldSplits[j]); - } - - } - } - } -} diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java index 5dbf6609585..3d483f3954c 100644 --- 
a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java +++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java @@ -31,9 +31,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.benchmark.BenchmarkTestCase; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; -import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker; import org.apache.lucene.benchmark.byTask.stats.TaskStats; -import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask; import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask; import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; import org.apache.lucene.collation.CollationKeyAnalyzer; @@ -159,110 +157,6 @@ public class TestPerfTasksLogic extends BenchmarkTestCase { //assertTrue(CountingSearchTestTask.numSearches > 0); } - public void testHighlighting() throws Exception { - // 1. alg definition (required in every "logic" test) - String algLines[] = { - "doc.stored=true", - "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", - "docs.file=" + getReuters20LinesFile(), - "query.maker=" + ReutersQueryMaker.class.getName(), - "ResetSystemErase", - "CreateIndex", - "{ AddDoc } : 100", - "ForceMerge(1)", - "CloseIndex", - "OpenReader", - "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200", - "CloseReader", - }; - - // 2. we test this value later - CountingHighlighterTestTask.numHighlightedResults = 0; - CountingHighlighterTestTask.numDocsRetrieved = 0; - // 3. execute the algorithm (required in every "logic" test) - Benchmark benchmark = execBenchmark(algLines); - - // 4. test specific checks after the benchmark run completed. - assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved); - //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs - //we probably should use a different doc/query maker, but... - assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0); - - assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory())); - // now we should be able to open the index for write. - IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND)); - iw.close(); - IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory()); - assertEquals("100 docs were added to the index, this is what we expect to find!",100,ir.numDocs()); - ir.close(); - } - - public void testHighlightingTV() throws Exception { - // 1. 
alg definition (required in every "logic" test) - String algLines[] = { - "doc.stored=true",//doc storage is required in order to have text to highlight - "doc.term.vector=true", - "doc.term.vector.offsets=true", - "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", - "docs.file=" + getReuters20LinesFile(), - "query.maker=" + ReutersQueryMaker.class.getName(), - "ResetSystemErase", - "CreateIndex", - "{ AddDoc } : 1000", - "ForceMerge(1)", - "CloseIndex", - "OpenReader", - "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200", - "CloseReader", - }; - - // 2. we test this value later - CountingHighlighterTestTask.numHighlightedResults = 0; - CountingHighlighterTestTask.numDocsRetrieved = 0; - // 3. execute the algorithm (required in every "logic" test) - Benchmark benchmark = execBenchmark(algLines); - - // 4. test specific checks after the benchmark run completed. - assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved); - //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs - //we probably should use a different doc/query maker, but... - assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0); - - assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory())); - // now we should be able to open the index for write. - IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND)); - iw.close(); - IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory()); - assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs()); - ir.close(); - } - - public void testHighlightingNoTvNoStore() throws Exception { - // 1. alg definition (required in every "logic" test) - String algLines[] = { - "doc.stored=false", - "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", - "docs.file=" + getReuters20LinesFile(), - "query.maker=" + ReutersQueryMaker.class.getName(), - "ResetSystemErase", - "CreateIndex", - "{ AddDoc } : 1000", - "ForceMerge(1)", - "CloseIndex", - "OpenReader", - "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200", - "CloseReader", - }; - - // 2. we test this value later - CountingHighlighterTestTask.numHighlightedResults = 0; - CountingHighlighterTestTask.numDocsRetrieved = 0; - // 3. execute the algorithm (required in every "logic" test) - expectThrows(Exception.class, () -> { - execBenchmark(algLines); - }); - } - /** * Test Exhasting Doc Maker logic */ diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java deleted file mode 100644 index da322dfc377..00000000000 --- a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.benchmark.byTask.tasks; - -import java.io.IOException; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.benchmark.byTask.PerfRunData; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.highlight.Highlighter; -import org.apache.lucene.search.highlight.QueryScorer; -import org.apache.lucene.search.highlight.SimpleHTMLFormatter; -import org.apache.lucene.search.highlight.TextFragment; -import org.apache.lucene.search.highlight.TokenSources; - -/** - * Test Search task which counts number of searches. - */ -public class CountingHighlighterTestTask extends SearchTravRetHighlightTask { - - public static int numHighlightedResults = 0; - public static int numDocsRetrieved = 0; - - public CountingHighlighterTestTask(PerfRunData runData) { - super(runData); - } - - @Override - protected Document retrieveDoc(IndexReader ir, int id) throws IOException { - Document document = ir.document(id); - if (document != null) { - numDocsRetrieved++; - } - return document; - } - - @Override - public BenchmarkHighlighter getBenchmarkHighlighter(Query q) { - highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q)); - return new BenchmarkHighlighter() { - @Override - public int doHighlight(IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text) throws Exception { - final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1; - TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset); - TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags); - numHighlightedResults += frag != null ? frag.length : 0; - return frag != null ? frag.length : 0; - } - }; - } -}
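Usage note (not part of the patch): with this change applied, a run selects a highlighter per task, e.g. SearchTravRetHighlight(type[UH],maxFrags[3],fields[body]), or rotates implementations across rounds via the highlighter=HlImpl:NONE:SH_A:... config property, and an algorithm file is run through the benchmark module's usual entry point, e.g. ant run-task -Dtask.alg=conf/highlighters-tv.alg from lucene/benchmark. For readers unfamiliar with the unified highlighter that the new UH_* type codes select, the Java sketch below mirrors the core calls UnifiedHLImpl makes, outside the benchmark harness. It is a minimal sketch under stated assumptions: the caller supplies the IndexSearcher, Analyzer, Query, and TopDocs; the field name "body", the class name, and the passage/length settings are illustrative, not from this patch.

    import java.io.IOException;
    import java.text.BreakIterator;
    import java.util.Locale;
    import java.util.Map;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.uhighlight.UnifiedHighlighter;

    public class UnifiedHighlightSketch {
      // Mirrors UnifiedHLImpl: configure the highlighter once, then highlight the top hits of a query.
      static Map<String, String[]> highlightBody(IndexSearcher searcher, Analyzer analyzer,
                                                 Query query, TopDocs topDocs) throws IOException {
        UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
        // The same knobs the benchmark task sets: sentence-sized passages, a cap on how much
        // leading content is considered, strict phrase highlighting, and multi-term (wildcard) support.
        highlighter.setBreakIterator(() -> BreakIterator.getSentenceInstance(Locale.ENGLISH));
        highlighter.setMaxLength(1_000_000); // stands in for the highlighter.maxDocCharsToAnalyze config
        highlighter.setHighlightPhrasesStrictly(true);
        highlighter.setHandleMultiTermQuery(true);
        // One passage per hit for the "body" field; the result maps field name to an array with one
        // formatted snippet per hit (a null entry where nothing matched).
        return highlighter.highlightFields(new String[]{"body"}, query, topDocs, new int[]{1});
      }
    }

The offset-source variants in the patch (UH_A, UH_V, UH_P, UH_PV) only change where the highlighter reads offsets from (re-analysis, term vectors, postings, or postings plus term vectors); the calls above are the same in every case.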