From d7d455246fbb7414f39820f4f5deb417ea4421e6 Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Thu, 18 Jun 2009 19:58:59 +0000 Subject: [PATCH] LUCENE-1595: Separate DocMaker into DocMaker and ContentSource. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@786233 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/benchmark/CHANGES.txt | 28 ++ contrib/benchmark/conf/analyzer.alg | 5 +- contrib/benchmark/conf/autoCommit.alg | 2 +- contrib/benchmark/conf/compound-penalty.alg | 7 +- contrib/benchmark/conf/createLineFile.alg | 4 +- contrib/benchmark/conf/deletepercent.alg | 5 +- contrib/benchmark/conf/deletes.alg | 8 +- contrib/benchmark/conf/extractWikipedia.alg | 2 +- contrib/benchmark/conf/highlight-profile.alg | 4 +- contrib/benchmark/conf/indexLineFile.alg | 2 +- .../indexing-flush-by-RAM-multithreaded.alg | 6 +- .../benchmark/conf/indexing-flush-by-RAM.alg | 6 +- .../benchmark/conf/indexing-multithreaded.alg | 6 +- contrib/benchmark/conf/indexing.alg | 6 +- .../conf/micro-standard-flush-by-ram.alg | 6 +- contrib/benchmark/conf/micro-standard.alg | 6 +- contrib/benchmark/conf/readContentSource.alg | 45 ++ contrib/benchmark/conf/sample.alg | 6 +- contrib/benchmark/conf/sloppy-phrase.alg | 6 +- contrib/benchmark/conf/sort-standard.alg | 4 +- .../benchmark/conf/standard-flush-by-RAM.alg | 6 +- .../conf/standard-highlights-notv.alg | 4 +- .../benchmark/conf/standard-highlights-tv.alg | 4 +- contrib/benchmark/conf/standard.alg | 6 +- contrib/benchmark/conf/tokenize.alg | 4 +- .../benchmark/conf/wikipedia-flush-by-RAM.alg | 2 +- contrib/benchmark/conf/wikipedia.alg | 2 +- contrib/benchmark/conf/wikipediaOneRound.alg | 2 +- .../lucene/benchmark/byTask/PerfRunData.java | 30 +- .../benchmark/byTask/feeds/BasicDocMaker.java | 335 --------------- .../benchmark/byTask/feeds/ContentSource.java | 201 +++++++++ .../byTask/feeds/DemoHTMLParser.java | 23 +- ...DirDocMaker.java => DirContentSource.java} | 189 +++++---- .../benchmark/byTask/feeds/DocData.java | 113 +++-- .../benchmark/byTask/feeds/DocMaker.java | 396 ++++++++++++++++-- .../byTask/feeds/EnwikiContentSource.java | 294 +++++++++++++ .../byTask/feeds/EnwikiDocMaker.java | 308 ++------------ .../byTask/feeds/FileBasedQueryMaker.java | 5 +- .../benchmark/byTask/feeds/HTMLParser.java | 4 +- .../benchmark/byTask/feeds/LineDocMaker.java | 236 ++--------- .../benchmark/byTask/feeds/LineDocSource.java | 116 +++++ .../byTask/feeds/ReutersContentSource.java | 147 +++++++ .../byTask/feeds/ReutersDocMaker.java | 135 ------ .../byTask/feeds/ReutersQueryMaker.java | 4 +- .../byTask/feeds/SimpleQueryMaker.java | 8 +- .../feeds/SimpleSloppyPhraseQueryMaker.java | 6 +- ...mpleDocMaker.java => SingleDocSource.java} | 39 +- .../byTask/feeds/SortableSimpleDocMaker.java | 56 --- .../byTask/feeds/SortableSingleDocSource.java | 95 +++++ .../byTask/feeds/TrecContentSource.java | 339 +++++++++++++++ .../benchmark/byTask/feeds/TrecDocMaker.java | 262 ------------ .../benchmark/byTask/programmatic/Sample.java | 5 +- .../benchmark/byTask/tasks/AddDocTask.java | 36 +- .../tasks/ConsumeContentSourceTask.java | 67 +++ .../benchmark/byTask/tasks/DeleteDocTask.java | 33 +- .../benchmark/byTask/tasks/PerfTask.java | 73 +++- .../byTask/tasks/ReadTokensTask.java | 51 +-- .../benchmark/byTask/tasks/TaskSequence.java | 5 +- .../benchmark/byTask/tasks/UpdateDocTask.java | 34 +- .../byTask/tasks/WriteLineDocTask.java | 44 +- .../byTask/utils/StringBufferReader.java | 173 ++++++++ .../benchmark/utils/ExtractWikipedia.java | 17 +- 
.../benchmark/byTask/TestPerfTasksLogic.java | 110 ++--- .../byTask/feeds/LineDocMakerTest.java | 24 -- ...erTest.java => TrecContentSourceTest.java} | 87 ++-- .../byTask/tasks/WriteLineDocTaskTest.java | 4 +- .../benchmark/quality/TestQualityRun.java | 8 +- 67 files changed, 2410 insertions(+), 1896 deletions(-) create mode 100644 contrib/benchmark/conf/readContentSource.alg delete mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java rename contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/{DirDocMaker.java => DirContentSource.java} (64%) create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java delete mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java rename contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/{SimpleDocMaker.java => SingleDocSource.java} (81%) delete mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSimpleDocMaker.java create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSingleDocSource.java create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java delete mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java rename contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/{TrecDocMakerTest.java => TrecContentSourceTest.java} (84%) diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt index f0212f5d821..bebbd350e57 100644 --- a/contrib/benchmark/CHANGES.txt +++ b/contrib/benchmark/CHANGES.txt @@ -3,6 +3,34 @@ Lucene Benchmark Contrib Change Log The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways. $Id:$ +6/17/09 + LUCENE-1595: This issue breaks previous external algorithms. DocMaker has been + replaced by a concrete class that accepts a ContentSource and iterates over + its documents. Most of the old DocMakers were converted into ContentSource + implementations, and DocMaker is now the default document-creation implementation, + providing an easy way to reuse fields. When [doc.maker] is not defined in + an algorithm, the new DocMaker is the default. If you have .alg files which + specify a DocMaker (like ReutersDocMaker), you should change the [doc.maker] line to: + [content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource] + + e.g. + doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker + becomes + content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource + + doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker + becomes + content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource + + Also, PerfTask now logs a message in tearDown() rather than each Task doing its + own logging.
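The log-related renames described next follow the same substitution pattern; for example, with hypothetical values:

doc.add.log.step=500
becomes
log.step=500

doc.delete.log.step=100
becomes
delete.log.step=100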
A new setting called [log.step] is consulted to determine how often + to log. [doc.add.log.step] is no longer a valid setting. For easy migration of + current .alg files, rename [doc.add.log.step] to [log.step] and [doc.delete.log.step] + to [delete.log.step]. + + Additionally, [doc.maker.forever] should be changed to [content.source.forever]. + (Shai Erera via Mark Miller) + 6/12/09 LUCENE-1539: Added DeleteByPercentTask which enables deleting a percentage of documents and searching on them. Changed CommitIndex diff --git a/contrib/benchmark/conf/analyzer.alg b/contrib/benchmark/conf/analyzer.alg index 7005d328309..1a1ec4cb5aa 100644 --- a/contrib/benchmark/conf/analyzer.alg +++ b/contrib/benchmark/conf/analyzer.alg @@ -30,13 +30,12 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/autoCommit.alg b/contrib/benchmark/conf/autoCommit.alg index 41dda883c25..63e0e00a5a7 100644 --- a/contrib/benchmark/conf/autoCommit.alg +++ b/contrib/benchmark/conf/autoCommit.alg @@ -38,7 +38,7 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=5000 +log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml diff --git a/contrib/benchmark/conf/compound-penalty.alg b/contrib/benchmark/conf/compound-penalty.alg index ff4d8b69004..ec37704dac1 100644 --- a/contrib/benchmark/conf/compound-penalty.alg +++ b/contrib/benchmark/conf/compound-penalty.alg @@ -34,14 +34,13 @@ directory=FSDirectory doc.stored=stored:true:true:false:false doc.tokenized=true doc.term.vector=vector:true:true:false:false -doc.add.log.step=500 -doc.delete.log.step=100 +log.step=500 +delete.log.step=100 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/createLineFile.alg b/contrib/benchmark/conf/createLineFile.alg index dae602811ea..969f30762df 100644 --- a/contrib/benchmark/conf/createLineFile.alg +++ b/contrib/benchmark/conf/createLineFile.alg @@ -29,13 +29,13 @@ # # Where to get documents from: -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource # Where to write the line file output: line.file.out=work/reuters.lines.txt # Stop after processing the document feed once: -doc.maker.forever=false +content.source.forever=false # ------------------------------------------------------------------------------------- diff --git a/contrib/benchmark/conf/deletepercent.alg b/contrib/benchmark/conf/deletepercent.alg index 66d87cf3603..cdd1f967f02 100644 --- a/contrib/benchmark/conf/deletepercent.alg +++ b/contrib/benchmark/conf/deletepercent.alg @@ -25,13 +25,14 @@ directory=FSDirectory doc.stored=true doc.tokenized=true 
doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 #doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/deletes.alg b/contrib/benchmark/conf/deletes.alg index 652c67fe7ee..31144131e61 100644 --- a/contrib/benchmark/conf/deletes.alg +++ b/contrib/benchmark/conf/deletes.alg @@ -32,14 +32,14 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=10000 -doc.delete.log.step=100 +log.step=10000 +delete.log.step=100 docs.dir=reuters-out #docs.dir=reuters-111 -doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/extractWikipedia.alg b/contrib/benchmark/conf/extractWikipedia.alg index 0754e0a374e..a341a941031 100644 --- a/contrib/benchmark/conf/extractWikipedia.alg +++ b/contrib/benchmark/conf/extractWikipedia.alg @@ -36,7 +36,7 @@ docs.file=temp/enwiki-20070527-pages-articles.xml line.file.out=work/enwiki.txt # Stop after processing the document feed once: -doc.maker.forever=false +content.source.forever=false # ------------------------------------------------------------------------------------- diff --git a/contrib/benchmark/conf/highlight-profile.alg b/contrib/benchmark/conf/highlight-profile.alg index 9e7d55e1de4..234ebb1e841 100644 --- a/contrib/benchmark/conf/highlight-profile.alg +++ b/contrib/benchmark/conf/highlight-profile.alg @@ -28,11 +28,11 @@ doc.tokenized=true doc.term.vector=true doc.term.vector.offsets=true doc.term.vector.positions=true -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/indexLineFile.alg b/contrib/benchmark/conf/indexLineFile.alg index 52c4af1e0c6..5b69a9cc575 100644 --- a/contrib/benchmark/conf/indexLineFile.alg +++ b/contrib/benchmark/conf/indexLineFile.alg @@ -38,7 +38,7 @@ doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker docs.file=work/reuters.lines.txt # Process documents only once: -doc.maker.forever=false +content.source.forever=false # ------------------------------------------------------------------------------------- diff --git a/contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg b/contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg index 66c6b791a47..2c16cee5bf6 100644 --- a/contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg +++ b/contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg @@ -30,13 +30,13 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out 
#docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/indexing-flush-by-RAM.alg b/contrib/benchmark/conf/indexing-flush-by-RAM.alg index 5a100b4101d..c2d3fb4e2c9 100644 --- a/contrib/benchmark/conf/indexing-flush-by-RAM.alg +++ b/contrib/benchmark/conf/indexing-flush-by-RAM.alg @@ -30,13 +30,13 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/indexing-multithreaded.alg b/contrib/benchmark/conf/indexing-multithreaded.alg index dda0a287716..27adde38e59 100644 --- a/contrib/benchmark/conf/indexing-multithreaded.alg +++ b/contrib/benchmark/conf/indexing-multithreaded.alg @@ -30,13 +30,13 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/indexing.alg b/contrib/benchmark/conf/indexing.alg index 16f21938195..9deccdc3c2a 100644 --- a/contrib/benchmark/conf/indexing.alg +++ b/contrib/benchmark/conf/indexing.alg @@ -30,13 +30,13 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/micro-standard-flush-by-ram.alg b/contrib/benchmark/conf/micro-standard-flush-by-ram.alg index d01723ee6e8..0d2c6853d3e 100644 --- a/contrib/benchmark/conf/micro-standard-flush-by-ram.alg +++ b/contrib/benchmark/conf/micro-standard-flush-by-ram.alg @@ -29,13 +29,13 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/micro-standard.alg b/contrib/benchmark/conf/micro-standard.alg index a51d0c2a42c..e0a554a24f8 100644 --- a/contrib/benchmark/conf/micro-standard.alg +++ b/contrib/benchmark/conf/micro-standard.alg @@ -28,13 +28,13 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/readContentSource.alg b/contrib/benchmark/conf/readContentSource.alg new file mode 100644 index 00000000000..9923af039de --- /dev/null +++ b/contrib/benchmark/conf/readContentSource.alg @@ -0,0 +1,45 @@ +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# ------------------------------------------------------------------------------------- + +# +# This alg reads the information from a ContentSource. It is useful for +# measuring the performance of a particular ContentSource implementation, or +# for gathering baselines for operations like indexing (if reading from the content +# source takes 'X' time, we cannot index faster).
+# +# To use this, first cd to contrib/benchmark and then run: +# +# ant run-task -Dtask.alg=conf/readContentSource.alg +# + +# Where to get documents from: +content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource +docs.file=temp/enwiki-20070527-pages-articles.xml.bz2 + +# Stop after processing the document feed once: +content.source.forever=false + +# Log messages every: +log.step=100000 + +# ------------------------------------------------------------------------------------- + +# Read all documents from the content source: +{ ConsumeContentSource } : * + +RepSumByPref ConsumeContentSource diff --git a/contrib/benchmark/conf/sample.alg b/contrib/benchmark/conf/sample.alg index c36b4f6c332..c7b9f25fb00 100644 --- a/contrib/benchmark/conf/sample.alg +++ b/contrib/benchmark/conf/sample.alg @@ -40,13 +40,13 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 -doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/sloppy-phrase.alg b/contrib/benchmark/conf/sloppy-phrase.alg index 71d5853166e..f0caad72599 100644 --- a/contrib/benchmark/conf/sloppy-phrase.alg +++ b/contrib/benchmark/conf/sloppy-phrase.alg @@ -28,13 +28,13 @@ directory=FSDirectory doc.stored=false doc.tokenized=true doc.term.vector=false -doc.add.log.step=500 +log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 -doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleSloppyPhraseQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker diff --git a/contrib/benchmark/conf/sort-standard.alg b/contrib/benchmark/conf/sort-standard.alg index 029962d5904..98e72ad3382 100644 --- a/contrib/benchmark/conf/sort-standard.alg +++ b/contrib/benchmark/conf/sort-standard.alg @@ -29,11 +29,11 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=100000 +log.step=100000 docs.dir=reuters-out -doc.maker=org.apache.lucene.benchmark.byTask.feeds.SortableSimpleDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker diff --git a/contrib/benchmark/conf/standard-flush-by-RAM.alg b/contrib/benchmark/conf/standard-flush-by-RAM.alg index 0727876efb5..ba60ac8247c 100644 --- a/contrib/benchmark/conf/standard-flush-by-RAM.alg +++ b/contrib/benchmark/conf/standard-flush-by-RAM.alg @@ -29,13 +29,13 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/standard-highlights-notv.alg b/contrib/benchmark/conf/standard-highlights-notv.alg index a39b67e1713..889f5d744f1 100644 --- a/contrib/benchmark/conf/standard-highlights-notv.alg +++ b/contrib/benchmark/conf/standard-highlights-notv.alg @@ -28,11 +28,11 @@ doc.tokenized=true doc.term.vector=false doc.term.vector.offsets=false doc.term.vector.positions=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/standard-highlights-tv.alg b/contrib/benchmark/conf/standard-highlights-tv.alg index f6e59d46011..8c7f5339a06 100644 --- a/contrib/benchmark/conf/standard-highlights-tv.alg +++ b/contrib/benchmark/conf/standard-highlights-tv.alg @@ -28,11 +28,11 @@ doc.tokenized=true doc.term.vector=true doc.term.vector.offsets=true doc.term.vector.positions=true -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/standard.alg b/contrib/benchmark/conf/standard.alg index b43c3e5b631..173b6eb629e 100644 --- a/contrib/benchmark/conf/standard.alg +++ b/contrib/benchmark/conf/standard.alg @@ -28,13 +28,13 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=2000 +log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 -#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker +#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker diff --git a/contrib/benchmark/conf/tokenize.alg b/contrib/benchmark/conf/tokenize.alg index 9d02862164b..57951ffb38d 100644 --- a/contrib/benchmark/conf/tokenize.alg +++ b/contrib/benchmark/conf/tokenize.alg @@ -25,8 +25,8 @@ # ant run-task -Dtask.alg=conf/tokenize.alg # -doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker -doc.maker.forever=false +content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource +content.source.forever=false # diff --git a/contrib/benchmark/conf/wikipedia-flush-by-RAM.alg b/contrib/benchmark/conf/wikipedia-flush-by-RAM.alg index 2a63cc32e2f..17c7588b6a3 100644 --- a/contrib/benchmark/conf/wikipedia-flush-by-RAM.alg +++ b/contrib/benchmark/conf/wikipedia-flush-by-RAM.alg @@ -37,7 +37,7 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=5000 +log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml diff --git a/contrib/benchmark/conf/wikipedia.alg b/contrib/benchmark/conf/wikipedia.alg index fd7d9c04c2d..8076032afab 100644 --- a/contrib/benchmark/conf/wikipedia.alg +++ 
b/contrib/benchmark/conf/wikipedia.alg @@ -33,7 +33,7 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=5000 +log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml diff --git a/contrib/benchmark/conf/wikipediaOneRound.alg b/contrib/benchmark/conf/wikipediaOneRound.alg index 20e9fba4d50..68a5483856f 100644 --- a/contrib/benchmark/conf/wikipediaOneRound.alg +++ b/contrib/benchmark/conf/wikipediaOneRound.alg @@ -33,7 +33,7 @@ directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false -doc.add.log.step=5000 +log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java index 4611d49099a..b9e4b3704ba 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java @@ -17,9 +17,13 @@ package org.apache.lucene.benchmark.byTask; * limitations under the License. */ +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; -import org.apache.lucene.benchmark.byTask.feeds.HTMLParser; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; import org.apache.lucene.benchmark.byTask.stats.Points; import org.apache.lucene.benchmark.byTask.tasks.ReadTask; @@ -33,11 +37,6 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; -import java.io.File; -import java.util.HashMap; -import java.util.Iterator; - - /** * Data maintained by a performance test run. *

@@ -62,7 +61,6 @@ public class PerfRunData { private Directory directory; private Analyzer analyzer; private DocMaker docMaker; - private HTMLParser htmlParser; // we use separate (identical) instances for each "read" task type, so each can iterate the queries separately. private HashMap readTaskQueryMaker; @@ -82,14 +80,11 @@ public class PerfRunData { "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance(); // doc maker docMaker = (DocMaker) Class.forName(config.get("doc.maker", - "org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker")).newInstance(); + "org.apache.lucene.benchmark.byTask.feeds.DocMaker")).newInstance(); docMaker.setConfig(config); // query makers readTaskQueryMaker = new HashMap(); qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker")); - // html parser, used for some doc makers - htmlParser = (HTMLParser) Class.forName(config.get("html.parser","org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser")).newInstance(); - docMaker.setHTMLParser(htmlParser); // index stuff reinit(false); @@ -229,9 +224,7 @@ public class PerfRunData { this.analyzer = analyzer; } - /** - * @return Returns the docMaker. - */ + /** Returns the docMaker. */ public DocMaker getDocMaker() { return docMaker; } @@ -243,7 +236,7 @@ public class PerfRunData { return config; } - public void resetInputs() { + public void resetInputs() throws IOException { docMaker.resetInputs(); Iterator it = readTaskQueryMaker.values().iterator(); while (it.hasNext()) { @@ -271,11 +264,4 @@ public class PerfRunData { return qm; } - /** - * @return Returns the htmlParser. - */ - public HTMLParser getHtmlParser() { - return htmlParser; - } - } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java deleted file mode 100644 index 11b02be0ab6..00000000000 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java +++ /dev/null @@ -1,335 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.benchmark.byTask.utils.Format; -import org.apache.lucene.document.DateTools; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - -import java.io.File; -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Iterator; - - -/** - * Create documents for the test. - * Maintains counters of chars etc. so that sub-classes just need to - * provide textual content, and the create-by-size is handled here. - * - *

- * Config Params (default is in caps): - * doc.stored=true|FALSE
- * doc.tokenized=TRUE|false
- * doc.term.vector=true|FALSE
- * doc.term.vector.positions=true|FALSE
- * doc.term.vector.offsets=true|FALSE
- * doc.store.body.bytes=true|FALSE //Store the body contents raw UTF-8 bytes as a field
- */ -public abstract class BasicDocMaker implements DocMaker { - - private int numDocsCreated = 0; - private boolean storeBytes = false; - protected boolean forever; - - private static class LeftOver { - private DocData docdata; - private int cnt; - } - - // leftovers are thread local, because it is unsafe to share residues between threads - private ThreadLocal leftovr = new ThreadLocal(); - - public static final String BODY_FIELD = "body"; - public static final String TITLE_FIELD = "doctitle"; - public static final String DATE_FIELD = "docdate"; - public static final String ID_FIELD = "docid"; - public static final String BYTES_FIELD = "bytes"; - public static final String NAME_FIELD = "docname"; - - private long numBytes = 0; - private long numUniqueBytes = 0; - - protected Config config; - - protected Field.Store storeVal = Field.Store.NO; - protected Field.Index indexVal = Field.Index.ANALYZED; - protected Field.TermVector termVecVal = Field.TermVector.NO; - - private synchronized int incrNumDocsCreated() { - return numDocsCreated++; - } - - /** - * Return the data of the next document. - * All current implementations can create docs forever. - * When the input data is exhausted, input files are iterated. - * This re-iteration can be avoided by setting doc.maker.forever to false (default is true). - * @return data of the next document. - * @exception if cannot create the next doc data - * @exception NoMoreDataException if data is exhausted (and 'forever' set to false). - */ - protected abstract DocData getNextDocData() throws NoMoreDataException, Exception; - - /* - * (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument() - */ - public Document makeDocument () throws Exception { - resetLeftovers(); - DocData docData = getNextDocData(); - Document doc = createDocument(docData,0,-1); - return doc; - } - - // create a doc - // use only part of the body, modify it to keep the rest (or use all if size==0). - // reset the docdata properties so they are not added more than once. - private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException { - int docid = incrNumDocsCreated(); - Document doc = new Document(); - doc.add(new Field(ID_FIELD, "doc"+docid, storeVal, indexVal, termVecVal)); - if (docData.getName()!=null) { - String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt); - doc.add(new Field(NAME_FIELD, name, storeVal, indexVal, termVecVal)); - } - if (docData.getDate()!=null) { - String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND); - doc.add(new Field(DATE_FIELD, dateStr, storeVal, indexVal, termVecVal)); - } - if (docData.getTitle()!=null) { - doc.add(new Field(TITLE_FIELD, docData.getTitle(), storeVal, indexVal, termVecVal)); - } - if (docData.getBody()!=null && docData.getBody().length()>0) { - String bdy; - if (size<=0 || size>=docData.getBody().length()) { - bdy = docData.getBody(); // use all - docData.setBody(""); // nothing left - } else { - // attempt not to break words - if whitespace found within next 20 chars... 
- for (int n=size-1; n ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline); - int nut = numUniqueTexts(); - if (nut > lastPrintedNumUniqueTexts) { - print = true; - sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline); - lastPrintedNumUniqueTexts = nut; - } - long nub = numUniqueBytes(); - if (nub > lastPrintedNumUniqueBytes) { - print = true; - sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline); - lastPrintedNumUniqueBytes = nub; - } - if (getCount()>0) { - print = true; - sb.append("num docs added since last inputs reset: ").append(Format.format(0,getCount(),col)).append(newline); - sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getByteCount(),col)).append(newline); - } - if (print) { - System.out.println(sb.append(newline).toString()); - printNum++; - } - } - - protected void collectFiles(File f, ArrayList inputFiles) { - //System.out.println("Collect: "+f.getAbsolutePath()); - if (!f.canRead()) { - return; - } - if (f.isDirectory()) { - String files[] = f.list(); - Arrays.sort(files); - for (int i = 0; i < files.length; i++) { - collectFiles(new File(f,files[i]),inputFiles); - } - return; - } - inputFiles.add(f); - addUniqueBytes(f.length()); - } - - /* (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#setHTMLParser(org.apache.lucene.benchmark.byTask.feeds.HTMLParser) - */ - public void setHTMLParser(HTMLParser htmlParser) { - this.htmlParser = htmlParser; - } - - /* - * (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#getHtmlParser() - */ - public HTMLParser getHtmlParser() { - return htmlParser; - } - - -} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java new file mode 100644 index 00000000000..6625feeb35c --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java @@ -0,0 +1,201 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.apache.lucene.benchmark.byTask.utils.Config; + +/** + * Represents content from a specified source, such as TREC, Reuters etc. 
A + * {@link ContentSource} is responsible for creating {@link DocData} objects for + * its documents to be consumed by {@link DocMaker}. It also keeps track of + * various statistics, such as how many documents were generated, size in bytes + * etc. + *

+ * Supports the following configuration parameters: + * content.source.forever - whether to iterate over the source again once its documents are exhausted (default: true) + * content.source.log.step - log a message every that many documents; 0 means no logging (default: 0) + * content.source.verbose - whether the content source should print log messages (default: false) + *

+ */ +public abstract class ContentSource { + + private static final int BZIP = 0; + private static final int OTHER = 1; + private static final Map extensionToType = new HashMap(); + static { + extensionToType.put(".bz2", Integer.valueOf(BZIP)); + extensionToType.put(".bzip", Integer.valueOf(BZIP)); + } + + protected static final int BUFFER_SIZE = 1 << 16; // 64K + + private long bytesCount; + private long totalBytesCount; + private int docsCount; + private int totalDocsCount; + private Config config; + + protected boolean forever; + protected int logStep; + protected boolean verbose; + + private CompressorStreamFactory csFactory = new CompressorStreamFactory(); + + protected final synchronized void addBytes(long numBytes) { + bytesCount += numBytes; + totalBytesCount += numBytes; + } + + protected final synchronized void addDoc() { + ++docsCount; + ++totalDocsCount; + } + + /** + * A convenience method for collecting all the files of a content source from + * a given directory. The collected {@link File} instances are stored in the + * given files list. + */ + protected final void collectFiles(File dir, ArrayList files) { + if (!dir.canRead()) { + return; + } + + File[] dirFiles = dir.listFiles(); + Arrays.sort(dirFiles); + for (int i = 0; i < dirFiles.length; i++) { + File file = dirFiles[i]; + if (file.isDirectory()) { + collectFiles(file, files); + } else if (file.canRead()) { + files.add(file); + } + } + } + + /** + * Returns an {@link InputStream} over the requested file. This method + * attempts to identify the appropriate {@link InputStream} instance to return + * based on the file name (e.g., if it ends with .bz2 or .bzip, return a + * 'bzip' {@link InputStream}). + */ + protected InputStream getInputStream(File file) throws IOException { + // First, create a FileInputStream, as this will be required by all types. + // Wrap with BufferedInputStream for better performance + InputStream is = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE); + + String fileName = file.getName(); + int idx = fileName.lastIndexOf('.'); + int type = OTHER; + if (idx != -1) { + Integer typeInt = (Integer) extensionToType.get(fileName.substring(idx)); + if (typeInt != null) { + type = typeInt.intValue(); + } + } + switch (type) { + case BZIP: + try { + // According to BZip2CompressorInputStream's code, it reads the first + // two file header chars ('B' and 'Z'). It is important to wrap the + // underlying input stream with a buffered one since + // BZip2CompressorInputStream uses the read() method exclusively. + is = csFactory.createCompressorInputStream("bzip2", is); + } catch (CompressorException e) { + IOException ioe = new IOException(e.getMessage()); + ioe.initCause(e); + throw ioe; + } + break; + default: // Do nothing, stay with FileInputStream + } + + return is; + } + + /** + * Returns true when it's time to log a message (depending on verbose and + * the number of documents generated). + */ + protected final boolean shouldLog() { + return verbose && logStep > 0 && docsCount % logStep == 0; + } + + /** Called when reading from this content source is no longer required. */ + public abstract void close() throws IOException; + + /** Returns the number of bytes generated since last reset. */ + public final long getBytesCount() { return bytesCount; } + + /** Returns the number of generated documents since last reset. 
*/ + public final int getDocsCount() { return docsCount; } + + public final Config getConfig() { return config; } + + /** Returns the next {@link DocData} from the content source. */ + public abstract DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException; + + /** Returns the total number of bytes that were generated by this source. */ + public final long getTotalBytesCount() { return totalBytesCount; } + + /** Returns the total number of generated documents. */ + public final int getTotalDocsCount() { return totalDocsCount; } + + /** + * Resets the input for this content source, so that the test behaves as + * if it had just started, input-wise. + *

+ * NOTE: the default implementation resets the number of bytes and + * documents generated since the last reset, so it's important to call + * super.resetInputs in case you override this method. + */ + public void resetInputs() throws IOException { + bytesCount = 0; + docsCount = 0; + } + + /** + * Sets the {@link Config} for this content source. If you override this + * method, you must call super.setConfig. + */ + public void setConfig(Config config) { + this.config = config; + forever = config.get("content.source.forever", true); + logStep = config.get("content.source.log.step", 0); + verbose = config.get("content.source.verbose", false); + } + +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java index e81be63c0d6..f28d38a58f6 100755 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java @@ -30,14 +30,7 @@ import java.util.Properties; */ public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser { - public DemoHTMLParser () { - } - - /* - * (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.io.Reader, java.text.DateFormat) - */ - public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException { + public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException { org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader); // title @@ -64,16 +57,22 @@ public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds. 
date = new Date(); // now } } - - return new DocData(name, bodyBuf.toString(), title, props, date); + + docData.clear(); + docData.setName(name); + docData.setBody(bodyBuf.toString()); + docData.setTitle(title); + docData.setProps(props); + docData.setDate(date); + return docData; } /* * (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.lang.StringBuffer, java.text.DateFormat) */ - public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException { - return parse(name, date, new StringReader(inputText.toString()), dateFormat); + public DocData parse(DocData docData, String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException { + return parse(docData, name, date, new StringReader(inputText.toString()), dateFormat); } } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java similarity index 64% rename from contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java rename to contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java index cc542f94ccc..fb8bfb4d84d 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java @@ -23,7 +23,9 @@ import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.FileReader; +import java.io.IOException; import java.text.DateFormat; +import java.text.ParsePosition; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; @@ -31,31 +33,25 @@ import java.util.Locale; import java.util.Stack; /** - * A DocMaker using the Dir collection for its input. - * - * Config properties: - * docs.dir=<path to the docs dir| Default: dir-out> - - * + * A {@link ContentSource} using the Dir collection for its input. Supports + * the following configuration parameters (on top of {@link ContentSource}): + *
+ * work.dir - the working directory (default: work) + * docs.dir - the directory of the input documents, resolved against work.dir when relative (default: dir-out) + *
*/ -public class DirDocMaker extends BasicDocMaker { +public class DirContentSource extends ContentSource { - protected ThreadLocal dateFormat = new ThreadLocal(); - protected File dataDir = null; - protected int iteration=0; + private static final class DateFormatInfo { + DateFormat df; + ParsePosition pos; + } - static public class Iterator implements java.util.Iterator { - - int count = 0; - - public int getCount(){ - return count; - } - - Stack stack = new Stack(); - - /* this seems silly ... there must be a better way ... - not that this is good, but can it matter? */ + public static class Iterator implements java.util.Iterator { static class Comparator implements java.util.Comparator { public int compare(Object _a, Object _b) { @@ -81,22 +77,17 @@ public class DirDocMaker extends BasicDocMaker { } } + int count = 0; + + Stack stack = new Stack(); + + /* this seems silly ... there must be a better way ... + not that this is good, but can it matter? */ + Comparator c = new Comparator(); - void push(File[] files) { - Arrays.sort(files, c); - for(int i = 0; i < files.length; i++) { - // System.err.println("push " + files[i]); - stack.push(files[i]); - } - } - - void push(File f) { - push(f.listFiles(new FileFilter() { - public boolean accept(File f) { return f.isDirectory(); } })); - push(f.listFiles(new FileFilter() { - public boolean accept(File f) { return f.getName().endsWith(".txt"); } })); - find(); + public Iterator(File f) { + push(f); } void find() { @@ -110,18 +101,38 @@ public class DirDocMaker extends BasicDocMaker { push(f); } - public Iterator(File f) { - push(f); + void push(File f) { + push(f.listFiles(new FileFilter() { + + public boolean accept(File file) { + return file.isDirectory(); + } + })); + push(f.listFiles(new FileFilter() { + + public boolean accept(File file) { + return file.getName().endsWith(".txt"); + } + })); + find(); } - public void remove() { - throw new RuntimeException("cannot"); + void push(File[] files) { + Arrays.sort(files, c); + for(int i = 0; i < files.length; i++) { + // System.err.println("push " + files[i]); + stack.push(files[i]); + } } - + + public int getCount(){ + return count; + } + public boolean hasNext() { return stack.size() > 0; } - + public Object next() { assert hasNext(); count++; @@ -131,42 +142,44 @@ public class DirDocMaker extends BasicDocMaker { return object; } - } - - protected Iterator inputFiles = null; - - /* (non-Javadoc) - * @see SimpleDocMaker#setConfig(java.util.Properties) - */ - public void setConfig(Config config) { - super.setConfig(config); - String d = config.get("docs.dir", "dir-out"); - dataDir = new File(d); - if (!dataDir.isAbsolute()) { - dataDir = new File(new File("work"), d); + public void remove() { + throw new RuntimeException("cannot"); } - inputFiles = new Iterator(dataDir); - - if (inputFiles==null) { - throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); - } } + + private ThreadLocal dateFormat = new ThreadLocal(); + private File dataDir = null; + private int iteration = 0; + private Iterator inputFiles = null; // get/initiate a thread-local simple date format (must do so // because SimpleDateFormat is not thread-safe). 
- protected DateFormat getDateFormat () { - DateFormat df = (DateFormat) dateFormat.get(); - if (df == null) { + private DateFormatInfo getDateFormatInfo() { + DateFormatInfo dfi = (DateFormatInfo) dateFormat.get(); + if (dfi == null) { + dfi = new DateFormatInfo(); + dfi.pos = new ParsePosition(0); // date format: 30-MAR-1987 14:22:36.87 - df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US); - df.setLenient(true); - dateFormat.set(df); + dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.US); + dfi.df.setLenient(true); + dateFormat.set(dfi); } - return df; + return dfi; } - protected DocData getNextDocData() throws Exception { + private Date parseDate(String dateStr) { + DateFormatInfo dfi = getDateFormatInfo(); + dfi.pos.setIndex(0); + dfi.pos.setErrorIndex(-1); + return dfi.df.parse(dateStr.trim(), dfi.pos); + } + + public void close() throws IOException { + inputFiles = null; + } + + public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { File f = null; String name = null; synchronized (this) { @@ -197,27 +210,37 @@ public class DirDocMaker extends BasicDocMaker { reader.close(); addBytes(f.length()); - Date date = getDateFormat().parse(dateStr.trim()); - return new DocData(name, bodyBuf.toString(), title, null, date); + Date date = parseDate(dateStr); + + docData.clear(); + docData.setName(name); + docData.setBody(bodyBuf.toString()); + docData.setTitle(title); + docData.setDate(date); + return docData; } - - - /* - * (non-Javadoc) - * @see DocMaker#resetIinputs() - */ - public synchronized void resetInputs() { + + public synchronized void resetInputs() throws IOException { super.resetInputs(); inputFiles = new Iterator(dataDir); iteration = 0; } - /* - * (non-Javadoc) - * @see DocMaker#numUniqueTexts() - */ - public int numUniqueTexts() { - return inputFiles.getCount(); + public void setConfig(Config config) { + super.setConfig(config); + + File workDir = new File(config.get("work.dir", "work")); + String d = config.get("docs.dir", "dir-out"); + dataDir = new File(d); + if (!dataDir.isAbsolute()) { + dataDir = new File(workDir, d); + } + + inputFiles = new Iterator(dataDir); + + if (inputFiles == null) { + throw new RuntimeException("No txt files in dataDir: " + dataDir.getAbsolutePath()); + } } } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java index a80008750b8..7d22e885c72 100755 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java @@ -20,94 +20,77 @@ package org.apache.lucene.benchmark.byTask.feeds; import java.util.Date; import java.util.Properties; -/** - * Output of parsing (e.g. HTML parsing) of an input document. - */ +import org.apache.lucene.document.DateTools; +/** Output of parsing (e.g. HTML parsing) of an input document. */ public class DocData { private String name; private String body; private String title; - private Date date; + private String date; private Properties props; - public DocData(String name, String body, String title, Properties props, Date date) { - this.name = name; - this.body = body; - this.title = title; - this.date = date; - this.props = props; + public void clear() { + name = null; + body = null; + title = null; + date = null; + props = null; } - - /** - * @return Returns the name. 
- */ - public String getName() { - return name; - } - - /** - * @param name The name to set. - */ - public void setName(String name) { - this.name = name; - } - - /** - * @return Returns the props. - */ - public Properties getProps() { - return props; - } - - /** - * @param props The props to set. - */ - public void setProps(Properties props) { - this.props = props; - } - - /** - * @return Returns the body. - */ + public String getBody() { return body; } /** - * @param body The body to set. + * @return the date. If the ctor with Date was called, then the String + * returned is the output of + * {@link DateTools#dateToString(Date, org.apache.lucene.document.DateTools.Resolution)} + * . Otherwise it's the String passed to the other ctor. */ - public void setBody(String body) { - this.body = body; + public String getDate() { + return date; + } + + public String getName() { + return name; + } + + public Properties getProps() { + return props; } - /** - * @return Returns the title. - */ public String getTitle() { return title; } - /** - * @param title The title to set. - */ + public void setBody(String body) { + this.body = body; + } + + public void setDate(Date date) { + if (date != null) { + setDate(DateTools.dateToString(date, DateTools.Resolution.SECOND)); + } else { + this.date = null; + } + } + + public void setDate(String date) { + this.date = date; + } + + public void setName(String name) { + this.name = name; + } + + public void setProps(Properties props) { + this.props = props; + } + public void setTitle(String title) { this.title = title; } - /** - * @return Returns the date. - */ - public Date getDate() { - return date; - } - - /** - * @param date The date to set. - */ - public void setDate(Date date) { - this.date = date; - } - } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java index c80be5d89c4..81a7242499b 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java @@ -17,55 +17,373 @@ package org.apache.lucene.benchmark.byTask.feeds; * limitations under the License. */ -import org.apache.lucene.document.Document; -import org.apache.lucene.benchmark.byTask.utils.Config; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Properties; +import java.util.Map.Entry; +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.benchmark.byTask.utils.Format; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; /** - * Create documents for the test. - *
Each call to makeDocument would create the next document. - * When input is exhausted, the DocMaker iterates over the input again, - * providing a source for unlimited number of documents, - * though not all of them are unique. + * Creates {@link Document} objects. Uses a {@link ContentSource} to generate + * {@link DocData} objects. Supports the following parameters: + * */ -public interface DocMaker { +public class DocMaker { - /** - * Create the next document, of the given size by input bytes. - * If the implementation does not support control over size, an exception is thrown. - * @param size size of document, or 0 if there is no size requirement. - * @exception if cannot make the document, or if size>0 was specified but this feature is not supported. + private static class LeftOver { + private DocData docdata; + private int cnt; + } + + static class DocState { + + private Map fields; + private boolean reuseFields; + Document doc; + DocData docData = new DocData(); + + public DocState(boolean reuseFields, Store store, Index index, TermVector termVector) { + + this.reuseFields = reuseFields; + + if (reuseFields) { + fields = new HashMap(); + + // Initialize the map with the default fields. + fields.put(BODY_FIELD, new Field(BODY_FIELD, "", store, index, termVector)); + fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", store, index, termVector)); + fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector)); + fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); + fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector)); + + doc = new Document(); + } + } + + /** + * Returns a field corresponding to the field name. If + * reuseFields was set to true, then it attempts to reuse a + * Field instance. If such a field does not exist, it creates a new one. + */ + Field getField(String name, Store store, Index index, TermVector termVector) { + if (!reuseFields) { + return new Field(name, "", store, index, termVector); + } + + Field f = (Field) fields.get(name); + if (f == null) { + f = new Field(name, "", store, index, termVector); + fields.put(name, f); + } + return f; + } + } + + private int numDocsCreated = 0; + private boolean storeBytes = false; + + // leftovers are thread local, because it is unsafe to share residues between threads + private ThreadLocal leftovr = new ThreadLocal(); + private ThreadLocal docState = new ThreadLocal(); + + public static final String BODY_FIELD = "body"; + public static final String TITLE_FIELD = "doctitle"; + public static final String DATE_FIELD = "docdate"; + public static final String ID_FIELD = "docid"; + public static final String BYTES_FIELD = "bytes"; + public static final String NAME_FIELD = "docname"; + + protected Config config; + + protected Store storeVal = Store.NO; + protected Index indexVal = Index.ANALYZED; + protected TermVector termVecVal = TermVector.NO; + + protected ContentSource source; + protected boolean reuseFields; + protected DocState localDocState; + + private int lastPrintedNumUniqueTexts = 0; + + private long lastPrintedNumUniqueBytes = 0; + + private int printNum = 0; + + // create a doc + // use only part of the body, modify it to keep the rest (or use all if size==0). + // reset the docdata properties so they are not added more than once. + private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException { + int docid = incrNumDocsCreated(); + DocState ds = reuseFields ? 
getDocState() : localDocState; + Document doc = reuseFields ? ds.doc : new Document(); + doc.getFields().clear(); + + // Set ID_FIELD + Field idField = ds.getField(ID_FIELD, storeVal, indexVal, termVecVal); + idField.setValue("doc" + docid); + doc.add(idField); + + // Set NAME_FIELD + String name = docData.getName(); + if (name == null) name = ""; + name = cnt < 0 ? name : name + "_" + cnt; + Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal); + nameField.setValue(name); + doc.add(nameField); + + // Set DATE_FIELD + String date = docData.getDate(); + if (date == null) { + date = ""; + } + Field dateField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal); + dateField.setValue(date); + doc.add(dateField); + + // Set TITLE_FIELD + String title = docData.getTitle(); + Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal); + titleField.setValue(title == null ? "" : title); + doc.add(titleField); + + String body = docData.getBody(); + if (body != null && body.length() > 0) { + String bdy; + if (size <= 0 || size >= body.length()) { + bdy = body; // use all + docData.setBody(""); // nothing left + } else { + // attempt not to break words - if whitespace found within next 20 chars... + for (int n = size - 1; n < size + 20 && n < body.length(); n++) { + if (Character.isWhitespace(body.charAt(n))) { + size = n; + break; + } + } + bdy = body.substring(0, size); // use part + docData.setBody(body.substring(size)); // some left + } + Field bodyField = ds.getField(BODY_FIELD, storeVal, indexVal, termVecVal); + bodyField.setValue(bdy); + doc.add(bodyField); + + if (storeBytes) { + Field bytesField = ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO); + bytesField.setValue(bdy.getBytes("UTF-8")); + doc.add(bytesField); + } + } + + Properties props = docData.getProps(); + if (props != null) { + for (Iterator iterator = props.entrySet().iterator(); iterator.hasNext();) { + Entry entry = (Entry) iterator.next(); + Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal); + f.setValue((String) entry.getValue()); + doc.add(f); + } + docData.setProps(null); + } + //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n=========="); + return doc; + } + + private void resetLeftovers() { + leftovr.set(null); + } + + protected DocState getDocState() { + DocState ds = (DocState) docState.get(); + if (ds == null) { + ds = new DocState(true, storeVal, indexVal, termVecVal); + docState.set(ds); + } + return ds; + } + + protected synchronized int incrNumDocsCreated() { + return numDocsCreated++; + } + + /** + * Closes the {@link DocMaker}. The base implementation closes the + * {@link ContentSource}, and it can be overridden to do more work (but make + * sure to call super.close()). + */ + public void close() throws IOException { + source.close(); + } + + /** + * Returns the number of bytes generated by the content source since last + * reset. + */ + public synchronized long getBytesCount() { + return source.getBytesCount(); + } + + /** + * Returns the total number of bytes that were generated by the content source + * defined to that doc maker. */ - public Document makeDocument (int size) throws Exception; + public long getTotalBytesCount() { + return source.getTotalBytesCount(); + } - /** Create the next document. */ - public Document makeDocument () throws Exception; + /** + * Creates a {@link Document} object ready for indexing. 
This method uses the + * {@link ContentSource} to get the next document from the source, and creates + * a {@link Document} object from the returned fields. If + * reuseFields was set to true, it will reuse {@link Document} + * and {@link Field} instances. + */ + public Document makeDocument() throws Exception { + resetLeftovers(); + DocData docData = source.getNextDocData(reuseFields ? getDocState().docData : localDocState.docData); + Document doc = createDocument(docData, 0, -1); + return doc; + } - /** Set the properties */ - public void setConfig (Config config); + /** + * Same as {@link #makeDocument()}, only this method creates a document of the + * given size input by size. + */ + public Document makeDocument(int size) throws Exception { + LeftOver lvr = (LeftOver) leftovr.get(); + if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null + || lvr.docdata.getBody().length() == 0) { + resetLeftovers(); + } + DocData docData = reuseFields ? getDocState().docData : localDocState.docData; + DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata); + int cnt = (lvr == null ? 0 : lvr.cnt); + while (dd.getBody() == null || dd.getBody().length() < size) { + DocData dd2 = dd; + dd = source.getNextDocData(new DocData()); + cnt = 0; + dd.setBody(dd2.getBody() + dd.getBody()); + } + Document doc = createDocument(dd, size, cnt); + if (dd.getBody() == null || dd.getBody().length() == 0) { + resetLeftovers(); + } else { + if (lvr == null) { + lvr = new LeftOver(); + leftovr.set(lvr); + } + lvr.docdata = dd; + lvr.cnt = ++cnt; + } + return doc; + } + + public void printDocStatistics() { + boolean print = false; + String col = " "; + StringBuffer sb = new StringBuffer(); + String newline = System.getProperty("line.separator"); + sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline); + int nut = source.getTotalDocsCount(); + if (nut > lastPrintedNumUniqueTexts) { + print = true; + sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline); + lastPrintedNumUniqueTexts = nut; + } + long nub = getTotalBytesCount(); + if (nub > lastPrintedNumUniqueBytes) { + print = true; + sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline); + lastPrintedNumUniqueBytes = nub; + } + if (source.getDocsCount() > 0) { + print = true; + sb.append("num docs added since last inputs reset: ").append(Format.format(0,source.getDocsCount(),col)).append(newline); + sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getBytesCount(),col)).append(newline); + } + if (print) { + System.out.println(sb.append(newline).toString()); + printNum++; + } + } /** Reset inputs so that the test run would behave, input wise, as if it just started. */ - public void resetInputs(); + public synchronized void resetInputs() throws IOException { + printDocStatistics(); + // re-initiate since properties by round may have changed. + setConfig(config); + source.resetInputs(); + numDocsCreated = 0; + resetLeftovers(); + } - /** Return how many real unique texts are available, 0 if not applicable. */ - public int numUniqueTexts(); - - /** Return total bytes of all available unique texts, 0 if not applicable */ - public long numUniqueBytes(); + /** Set the configuration parameters of this doc maker. 
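// A hedged usage sketch (not part of the patch) of the new concrete DocMaker's
// lifecycle: configure, pull documents, reset between rounds, close. The
// property values are assumptions for illustration, and Config(Properties) is
// assumed available as the programmatic entry point.
import java.util.Properties;

import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;

public class DocMakerExample {
  public static void main(String[] args) throws Exception {
    Properties p = new Properties();
    // SingleDocSource is also the default when content.source is not set.
    p.setProperty("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource");
    p.setProperty("doc.stored", "true");

    DocMaker docMaker = new DocMaker();
    docMaker.setConfig(new Config(p));

    for (int i = 0; i < 3; i++) {
      Document doc = docMaker.makeDocument();
      System.out.println(doc.get(DocMaker.ID_FIELD)); // doc0, doc1, doc2
    }

    docMaker.resetInputs(); // prints statistics and rewinds the source
    docMaker.close();       // closes the underlying ContentSource too
  }
}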
*/ + public void setConfig(Config config) { + this.config = config; + try { + String sourceClass = config.get("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource"); + source = (ContentSource) Class.forName(sourceClass).newInstance(); + source.setConfig(config); + } catch (Exception e) { + // Should not get here. Throw runtime exception. + throw new RuntimeException(e); + } - /** Return number of docs made since last reset. */ - public int getCount(); + boolean stored = config.get("doc.stored", false); + boolean tokenized = config.get("doc.tokenized", true); + boolean termVec = config.get("doc.term.vector", false); + storeVal = (stored ? Field.Store.YES : Field.Store.NO); + indexVal = (tokenized ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED); + boolean termVecPositions = config.get("doc.term.vector.positions", false); + boolean termVecOffsets = config.get("doc.term.vector.offsets", false); + if (termVecPositions && termVecOffsets) { + termVecVal = TermVector.WITH_POSITIONS_OFFSETS; + } else if (termVecPositions) { + termVecVal = TermVector.WITH_POSITIONS; + } else if (termVecOffsets) { + termVecVal = TermVector.WITH_OFFSETS; + } else if (termVec) { + termVecVal = TermVector.YES; + } else { + termVecVal = TermVector.NO; + } + storeBytes = config.get("doc.store.body.bytes", false); + + reuseFields = config.get("doc.reuse.fields", true); + if (!reuseFields) { + localDocState = new DocState(false, storeVal, indexVal, termVecVal); + } else { + // In a multi-rounds run, it is important to reset DocState since settings + // of fields may change between rounds, and this is the only way to reset + // the cache of all threads. + docState = new ThreadLocal(); + } + } - /** Return total byte size of docs made since last reset. */ - public long getByteCount(); - - /** Print some statistics on docs available/added/etc. */ - public void printDocStatistics(); - - /** Set the html parser to use, when appropriate */ - public void setHTMLParser(HTMLParser htmlParser); - - /** Returns the htmlParser. */ - public HTMLParser getHtmlParser(); - -} \ No newline at end of file +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java new file mode 100644 index 00000000000..ef1a53f9657 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java @@ -0,0 +1,294 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; +import org.xml.sax.helpers.DefaultHandler; +import org.xml.sax.helpers.XMLReaderFactory; + +/** + * A {@link ContentSource} which reads the English Wikipedia dump. You can read + * the .bz2 file directly (it will be decompressed on the fly). Config + * properties: + * <ul> + * <li>keep.image.only.docs=false|true (default: true) + * <li>docs.file=&lt;path to the file&gt; + * </ul> + */ +public class EnwikiContentSource extends ContentSource { + + private class Parser extends DefaultHandler implements Runnable { + private Thread t; + private boolean threadDone; + private String[] tuple; + private NoMoreDataException nmde; + private StringBuffer contents = new StringBuffer(); + private String title; + private String body; + private String time; + private String id; + + String[] next() throws NoMoreDataException { + if (t == null) { + threadDone = false; + t = new Thread(this); + t.setDaemon(true); + t.start(); + } + String[] result; + synchronized(this){ + while(tuple == null && nmde == null && !threadDone) { + try { + wait(); + } catch (InterruptedException ie) { + } + } + if (nmde != null) { + // Set to null so we will re-start thread in case + // we are re-used: + t = null; + throw nmde; + } + if (t != null && threadDone) { + // The thread has exited yet did not hit end of + // data, so this means it hit an exception. We + // throw NoMoreDataException here to force + // benchmark to stop the current alg: + throw new NoMoreDataException(); + } + result = tuple; + tuple = null; + notify(); + } + return result; + } + + String time(String original) { + StringBuffer buffer = new StringBuffer(); + + buffer.append(original.substring(8, 10)); + buffer.append('-'); + buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]); + buffer.append('-'); + buffer.append(original.substring(0, 4)); + buffer.append(' '); + buffer.append(original.substring(11, 19)); + buffer.append(".000"); + + return buffer.toString(); + } + + public void characters(char[] ch, int start, int length) { + contents.append(ch, start, length); + } + + public void endElement(String namespace, String simple, String qualified) + throws SAXException { + int elemType = getElementType(qualified); + switch (elemType) { + case PAGE: + // the body must not be null, and either we are keeping image docs or the + // title does not start with Image: + if (body != null && (keepImages || !title.startsWith("Image:"))) { + String[] tmpTuple = new String[LENGTH]; + tmpTuple[TITLE] = title.replace('\t', ' '); + tmpTuple[DATE] = time.replace('\t', ' '); + tmpTuple[BODY] = body.replaceAll("[\t\n]", " "); + tmpTuple[ID] = id; + synchronized(this) { + while (tuple != null) { + try { + wait(); + } catch (InterruptedException ie) { + } + } + tuple = tmpTuple; + notify(); + } + } + break; + case BODY: + body = contents.toString(); + // workaround: startsWith doesn't have an ignore-case option, so check the first 10 chars. + String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(); + if (startsWith.startsWith("#redirect")) { + body = null; + } + break; + case DATE: + time = time(contents.toString()); + break; + case TITLE: + title = contents.toString(); + break; + case ID: + id = contents.toString(); + break; + default: + // this element should be discarded.
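// An isolated sketch (illustrative class name, not part of the patch) of the
// one-slot wait()/notify() handoff that Parser.next() and endElement() above
// implement between the SAX thread and the benchmark thread: the producer
// blocks while the slot is full, the consumer blocks while it is empty, so at
// most one parsed page is ever buffered.
public class SingleSlotHandoff {
  private String[] slot;

  // Called by the producing (SAX) thread when a page is complete.
  public synchronized void put(String[] tuple) throws InterruptedException {
    while (slot != null) {
      wait(); // slot still occupied; wait for the consumer
    }
    slot = tuple;
    notify();
  }

  // Called by the consuming thread to fetch the next parsed page.
  public synchronized String[] take() throws InterruptedException {
    while (slot == null) {
      wait(); // nothing parsed yet; wait for the producer
    }
    String[] result = slot;
    slot = null;
    notify();
    return result;
  }
}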
+ } + } + + public void run() { + + try { + XMLReader reader = XMLReaderFactory.createXMLReader(); + reader.setContentHandler(this); + reader.setErrorHandler(this); + while(true){ + final InputStream localFileIS = is; + try { + reader.parse(new InputSource(localFileIS)); + } catch (IOException ioe) { + synchronized(EnwikiContentSource.this) { + if (localFileIS != is) { + // fileIS was closed on us, so, just fall + // through + } else + // Exception is real + throw ioe; + } + } + synchronized(this) { + if (!forever) { + nmde = new NoMoreDataException(); + notify(); + return; + } else if (localFileIS == is) { + // If file is not already re-opened then re-open it now + is = getInputStream(file); + } + } + } + } catch (SAXException sae) { + throw new RuntimeException(sae); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } finally { + synchronized(this) { + threadDone = true; + notify(); + } + } + } + + public void startElement(String namespace, String simple, String qualified, + Attributes attributes) { + int elemType = getElementType(qualified); + switch (elemType) { + case PAGE: + title = null; + body = null; + time = null; + id = null; + break; + // intentional fall-through. + case BODY: + case DATE: + case TITLE: + case ID: + contents.setLength(0); + break; + default: + // this element should be discarded. + } + } + } + + private static final Map ELEMENTS = new HashMap(); + private static final int TITLE = 0; + private static final int DATE = TITLE + 1; + private static final int BODY = DATE + 1; + private static final int ID = BODY + 1; + private static final int LENGTH = ID + 1; + // LENGTH is used as the size of the tuple, so whatever constants we need that + // should not be part of the tuple, we should define them after LENGTH. + private static final int PAGE = LENGTH + 1; + + private static final String[] months = {"JAN", "FEB", "MAR", "APR", + "MAY", "JUN", "JUL", "AUG", + "SEP", "OCT", "NOV", "DEC"}; + + static { + ELEMENTS.put("page", Integer.valueOf(PAGE)); + ELEMENTS.put("text", Integer.valueOf(BODY)); + ELEMENTS.put("timestamp", Integer.valueOf(DATE)); + ELEMENTS.put("title", Integer.valueOf(TITLE)); + ELEMENTS.put("id", Integer.valueOf(ID)); + } + + /** + * Returns the type of the element if defined, otherwise returns -1. This + * method is useful in startElement and endElement, by not needing to compare + * the element qualified name over and over. + */ + private final static int getElementType(String elem) { + Integer val = (Integer) ELEMENTS.get(elem); + return val == null ? 
-1 : val.intValue(); + } + + private File file; + private boolean keepImages = true; + private InputStream is; + private Parser parser = new Parser(); + + public void close() throws IOException { + synchronized (EnwikiContentSource.this) { + if (is != null) { + is.close(); + is = null; + } + } + } + + public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { + String[] tuple = parser.next(); + docData.clear(); + docData.setName(tuple[ID]); + docData.setBody(tuple[BODY]); + docData.setDate(tuple[DATE]); + docData.setTitle(tuple[TITLE]); + return docData; + } + + public void resetInputs() throws IOException { + super.resetInputs(); + is = getInputStream(file); + } + + public void setConfig(Config config) { + super.setConfig(config); + keepImages = config.get("keep.image.only.docs", true); + String fileName = config.get("docs.file", null); + if (fileName == null) { + throw new IllegalArgumentException("docs.file must be set"); + } + file = new File(fileName).getAbsoluteFile(); + } + +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java index 64c11e3b357..e608a4fdc67 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java @@ -17,288 +17,54 @@ package org.apache.lucene.benchmark.byTask.feeds; * limitations under the License. */ -import java.io.IOException; -import java.io.InputStream; -import java.util.HashMap; -import java.util.Map; - import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; -import org.xml.sax.Attributes; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import org.xml.sax.XMLReader; -import org.xml.sax.helpers.DefaultHandler; -import org.xml.sax.helpers.XMLReaderFactory; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; /** - * A {@link LineDocMaker} which reads the english wikipedia - * dump. You can read the .bz2 file directly (it will be - * decompressed on the fly). - * Config properties: - * - * - * @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker + * A {@link DocMaker} which reads the English Wikipedia dump. Uses + * {@link EnwikiContentSource} as its content source, regardless if a different + * content source was defined in the configuration. */ -public class EnwikiDocMaker extends LineDocMaker { +public class EnwikiDocMaker extends DocMaker { - private static final Map ELEMENTS = new HashMap(); - - static final int TITLE = 0; - static final int DATE = TITLE + 1; - static final int BODY = DATE + 1; - static final int ID = BODY + 1; - static final int LENGTH = ID + 1; - // LENGTH is used as the size of the tuple, so whatever constants we need that - // should not be part of the tuple, we should define them after LENGTH. - static final int PAGE = LENGTH + 1; - - static final String[] months = {"JAN", "FEB", "MAR", "APR", - "MAY", "JUN", "JUL", "AUG", - "SEP", "OCT", "NOV", "DEC"}; + public Document makeDocument() throws Exception { + DocState ds = reuseFields ? getDocState() : localDocState; + DocData dd = source.getNextDocData(ds.docData); + Document doc = reuseFields ? 
ds.doc : new Document(); + doc.getFields().clear(); - static { - ELEMENTS.put("page", new Integer(PAGE)); - ELEMENTS.put("text", new Integer(BODY)); - ELEMENTS.put("timestamp", new Integer(DATE)); - ELEMENTS.put("title", new Integer(TITLE)); - ELEMENTS.put("id", new Integer(ID)); + Field body = ds.getField(BODY_FIELD, storeVal, Index.ANALYZED, termVecVal); + body.setValue(dd.getBody()); + doc.add(body); + + Field title = ds.getField(TITLE_FIELD, storeVal, Index.ANALYZED, termVecVal); + title.setValue(dd.getTitle()); + doc.add(title); + + Field date = ds.getField(DATE_FIELD, storeVal, Index.ANALYZED, termVecVal); + date.setValue(dd.getDate()); + doc.add(date); + + Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO); + id.setValue(dd.getName()); + doc.add(id); + + return doc; } - - /** - * Returns the type of the element if defined, otherwise returns -1. This - * method is useful in startElement and endElement, by not needing to compare - * the element qualified name over and over. - */ - private final static int getElementType(String elem) { - Integer val = (Integer) ELEMENTS.get(elem); - return val == null ? -1 : val.intValue(); + + public Document makeDocument(int size) throws Exception { + throw new RuntimeException("cannot change document size with EnwikiDocMaker"); } - - protected boolean keepImages = true; public void setConfig(Config config) { super.setConfig(config); - keepImages = config.get("keep.image.only.docs", true); + // Override whatever content source was set in the config + source = new EnwikiContentSource(); + source.setConfig(config); } - - class Parser extends DefaultHandler implements Runnable { - Thread t; - boolean threadDone; - - public void run() { - - try { - XMLReader reader = - XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); - reader.setContentHandler(this); - reader.setErrorHandler(this); - while(true){ - final InputStream localFileIS = fileIS; - try { - InputSource is = new InputSource(localFileIS); - reader.parse(is); - } catch (IOException ioe) { - synchronized(EnwikiDocMaker.this) { - if (localFileIS != fileIS) { - // fileIS was closed on us, so, just fall - // through - } else - // Exception is real - throw ioe; - } - } - synchronized(this) { - if (!forever) { - nmde = new NoMoreDataException(); - notify(); - return; - } else if (localFileIS == fileIS) { - // If file is not already re-opened then - // re-open it now - openFile(); - } - } - } - } catch (SAXException sae) { - throw new RuntimeException(sae); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } finally { - synchronized(this) { - threadDone = true; - notify(); - } - } - } - - String[] tuple; - NoMoreDataException nmde; - - String[] next() throws NoMoreDataException { - if (t == null) { - threadDone = false; - t = new Thread(this); - t.setDaemon(true); - t.start(); - } - String[] result; - synchronized(this){ - while(tuple == null && nmde == null && !threadDone) { - try { - wait(); - } catch (InterruptedException ie) { - } - } - if (nmde != null) { - // Set to null so we will re-start thread in case - // we are re-used: - t = null; - throw nmde; - } - if (t != null && threadDone) { - // The thread has exited yet did not hit end of - // data, so this means it hit an exception. 
We - // throw NoMorDataException here to force - // benchmark to stop the current alg: - throw new NoMoreDataException(); - } - result = tuple; - tuple = null; - notify(); - } - return result; - } - - StringBuffer contents = new StringBuffer(); - - public void characters(char[] ch, int start, int length) { - contents.append(ch, start, length); - } - - String title; - String body; - String time; - String id; - - public void startElement(String namespace, - String simple, - String qualified, - Attributes attributes) { - int elemType = getElementType(qualified); - switch (elemType) { - case PAGE: - title = null; - body = null; - time = null; - id = null; - break; - // intentional fall-through. - case BODY: - case DATE: - case TITLE: - case ID: - contents.setLength(0); - break; - default: - // this element should be discarded. - } - } - - String time(String original) { - StringBuffer buffer = new StringBuffer(); - - buffer.append(original.substring(8, 10)); - buffer.append('-'); - buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]); - buffer.append('-'); - buffer.append(original.substring(0, 4)); - buffer.append(' '); - buffer.append(original.substring(11, 19)); - buffer.append(".000"); - - return buffer.toString(); - } - - public void create(String title, String time, String body, String id) { - String[] t = new String[LENGTH]; - t[TITLE] = title.replace('\t', ' '); - t[DATE] = time.replace('\t', ' '); - t[BODY] = body.replaceAll("[\t\n]", " "); - t[ID] = id; - synchronized(this) { - while(tuple!=null) { - try { - wait(); - } catch (InterruptedException ie) { - } - } - tuple = t; - notify(); - } - } - - public void endElement(String namespace, String simple, String qualified) - throws SAXException { - int elemType = getElementType(qualified); - switch (elemType) { - case PAGE: - // the body must be null and we either are keeping image docs or the - // title does not start with Image: - if (body != null && (keepImages || !title.startsWith("Image:"))) { - create(title, time, body, id); - } - break; - case BODY: - body = contents.toString(); - //workaround that startswith doesn't have an ignore case option, get at least 20 chars. - String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(); - if (startsWith.startsWith("#redirect")) { - body = null; - } - break; - case DATE: - time = time(contents.toString()); - break; - case TITLE: - title = contents.toString(); - break; - case ID: - id = contents.toString(); - break; - default: - // this element should be discarded. 
- } - } - } - - Parser parser = new Parser(); - - class DocState extends LineDocMaker.DocState { - public Document setFields(String[] tuple) { - titleField.setValue(tuple[TITLE]); - dateField.setValue(tuple[DATE]); - bodyField.setValue(tuple[BODY]); - idField.setValue(tuple[ID]); - return doc; - } - } - - private DocState getDocState() { - DocState ds = (DocState) docState.get(); - if (ds == null) { - ds = new DocState(); - docState.set(ds); - } - return ds; - } - - public Document makeDocument() throws Exception { - String[] tuple = parser.next(); - return getDocState().setFields(tuple); - } - + } \ No newline at end of file diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java index b357f676760..66f62df12de 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java @@ -46,7 +46,7 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMake Analyzer anlzr = (Analyzer) Class.forName(config.get("analyzer", "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance(); - String defaultField = config.get("file.query.maker.default.field", BasicDocMaker.BODY_FIELD); + String defaultField = config.get("file.query.maker.default.field", DocMaker.BODY_FIELD); QueryParser qp = new QueryParser(defaultField, anlzr); List qq = new ArrayList(); @@ -55,8 +55,7 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMake { File file = new File(fileName); Reader reader = null; - if (file != null && file.exists()) - { + if (file.exists()) { reader = new FileReader(file); } else { //see if we can find it as a resource diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java index cf276ab6dcf..0155324b6bd 100755 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java @@ -39,13 +39,13 @@ public interface HTMLParser { * @throws IOException * @throws InterruptedException */ - public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException; + public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException; /** * Parse the inputText and return DocData. * @param inputText the html text to parse. 
* @see #parse(String, Date, Reader, DateFormat) */ - public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException; + public DocData parse(DocData docData, String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException; } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java index b37dba130ff..a618ea83428 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java @@ -17,246 +17,76 @@ package org.apache.lucene.benchmark.byTask.feeds; * limitations under the License. */ -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.util.Random; -import org.apache.commons.compress.compressors.CompressorException; -import org.apache.commons.compress.compressors.CompressorStreamFactory; -import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; /** * A DocMaker reading one line at a time as a Document from a single file. This - * saves IO cost (over DirDocMaker) of recursing through a directory and opening - * a new file for every document. It also re-uses its Document and Field + * saves IO cost (over DirContentSource) of recursing through a directory and + * opening a new file for every document. It also re-uses its Document and Field * instance to improve indexing speed.
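// A hedged sketch (illustrative class name, not part of the patch) of the
// DocData-reuse convention the new getNextDocData(DocData) and
// parse(DocData, ...) signatures above follow: the caller passes in a DocData,
// the source clear()s and refills it, and returns the same instance, so a
// tight loop allocates no per-document DocData objects.
import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
import org.apache.lucene.benchmark.byTask.feeds.DocData;

public class DocDataReuseLoop {
  // Assumes `source` has already had setConfig() called on it.
  public static void drain(ContentSource source, int n) throws Exception {
    DocData docData = new DocData(); // one instance, refilled n times
    for (int i = 0; i < n; i++) {
      docData = source.getNextDocData(docData); // clear()s and refills it
      System.out.println(docData.getName());
    }
  }
}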
* The expected format of each line is (arguments are separated by <TAB>): * title, date, body. If a line is read in a different format, a * {@link RuntimeException} will be thrown. In general, you should use this doc - * maker with files that were created with {@link WriteLineDocTask}.

- * + * maker with files that were created with {@link WriteLineDocTask}.
+ *
* Config properties: * */ -public class LineDocMaker extends BasicDocMaker { +public class LineDocMaker extends DocMaker { - InputStream fileIS; - BufferedReader fileIn; - ThreadLocal docState = new ThreadLocal(); - private String fileName; - - private static int READER_BUFFER_BYTES = 64*1024; - private final DocState localDocState = new DocState(); - - private boolean doReuseFields = true; - private boolean bzipCompressionEnabled = false; private Random r; private int numDocs; - - private CompressorStreamFactory csFactory = new CompressorStreamFactory(); - - class DocState { - Document doc; - Field bodyField; - Field titleField; - Field dateField; - Field idField; - - public DocState() { - - bodyField = new Field(BasicDocMaker.BODY_FIELD, - "", - storeVal, - Field.Index.ANALYZED, - termVecVal); - titleField = new Field(BasicDocMaker.TITLE_FIELD, - "", - storeVal, - Field.Index.ANALYZED, - termVecVal); - dateField = new Field(BasicDocMaker.DATE_FIELD, - "", - storeVal, - Field.Index.ANALYZED, - termVecVal); - idField = new Field(BasicDocMaker.ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); - - doc = new Document(); - doc.add(bodyField); - doc.add(titleField); - doc.add(dateField); - doc.add(idField); - } - - final static char SEP = WriteLineDocTask.SEP; - - private int numDocsCreated; - private synchronized int incrNumDocsCreated() { - return numDocsCreated++; - } - - public Document setFields(String line) { - // A line must be in the following format. If it's not, fail ! - // title date body - int spot = line.indexOf(SEP); - if (spot == -1) { - throw new RuntimeException("line: [" + line + "] is in an invalid format !"); - } - int spot2 = line.indexOf(SEP, 1 + spot); - if (spot2 == -1) { - throw new RuntimeException("line: [" + line + "] is in an invalid format !"); - } - final String title = line.substring(0, spot); - final String date = line.substring(1+spot, spot2); - final String body = line.substring(1+spot2, line.length()); - final String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated()); - - if (doReuseFields) { - idField.setValue(docID); - titleField.setValue(title); - dateField.setValue(date); - bodyField.setValue(body); - return doc; - } else { - Field localIDField = new Field(BasicDocMaker.ID_FIELD, - docID, - Field.Store.YES, - Field.Index.NOT_ANALYZED_NO_NORMS); - - Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD, - title, - storeVal, - Field.Index.ANALYZED, - termVecVal); - Field localBodyField = new Field(BasicDocMaker.BODY_FIELD, - body, - storeVal, - Field.Index.ANALYZED, - termVecVal); - Field localDateField = new Field(BasicDocMaker.BODY_FIELD, - date, - storeVal, - Field.Index.ANALYZED, - termVecVal); - Document localDoc = new Document(); - localDoc.add(localIDField); - localDoc.add(localBodyField); - localDoc.add(localTitleField); - localDoc.add(localDateField); - return localDoc; - } - } - } - - protected DocData getNextDocData() throws Exception { - throw new RuntimeException("not implemented"); - } - - private DocState getDocState() { - DocState ds = (DocState) docState.get(); - if (ds == null) { - ds = new DocState(); - docState.set(ds); - } - return ds; - } public Document makeDocument() throws Exception { - String line; - synchronized(this) { - line = fileIn.readLine(); - if (line == null) { - if (!forever) { - throw new NoMoreDataException(); - } - // Reset the file - openFile(); - return makeDocument(); - } - } + DocState ds = reuseFields ? 
getDocState() : localDocState; + DocData dd = source.getNextDocData(ds.docData); + Document doc = reuseFields ? ds.doc : new Document(); + doc.getFields().clear(); - if (doReuseFields) - return getDocState().setFields(line); - else - return localDocState.setFields(line); + Field body = ds.getField(BODY_FIELD, storeVal, Index.ANALYZED, termVecVal); + body.setValue(dd.getBody()); + doc.add(body); + + Field title = ds.getField(TITLE_FIELD, storeVal, Index.ANALYZED, termVecVal); + title.setValue(dd.getTitle()); + doc.add(title); + + Field date = ds.getField(DATE_FIELD, storeVal, Index.ANALYZED, termVecVal); + date.setValue(dd.getDate()); + doc.add(date); + + String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated()); + Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO); + id.setValue(docID); + doc.add(id); + + return doc; } public Document makeDocument(int size) throws Exception { - throw new RuntimeException("cannot change document size with LineDocMaker; please use DirDocMaker instead"); + throw new RuntimeException("cannot change document size with LineDocMaker"); } - public synchronized void resetInputs() { - super.resetInputs(); - openFile(); - } - public void setConfig(Config config) { super.setConfig(config); - fileName = config.get("docs.file", null); - if (fileName == null) { - throw new IllegalArgumentException("docs.file must be set"); - } - doReuseFields = config.get("doc.reuse.fields", true); - String doBZCompress = config.get("bzip.compression", null); - if (doBZCompress != null) { - // Property was set, use the value. - bzipCompressionEnabled = Boolean.valueOf(doBZCompress).booleanValue(); - } else { - // Property was not set, attempt to detect based on file's extension - bzipCompressionEnabled = fileName.endsWith("bz2"); - } + source = new LineDocSource(); + source.setConfig(config); numDocs = config.get("doc.random.id.limit", -1); if (numDocs != -1) { r = new Random(179); } } - synchronized void openFile() { - try { - if (fileIn != null) { - fileIn.close(); - } - fileIS = new FileInputStream(fileName); - if (bzipCompressionEnabled) { - // According to BZip2CompressorInputStream's code, it reads the first - // two file header chars ('B' and 'Z'). We only need to wrap the - // underlying stream with a BufferedInputStream, since the code uses - // the read() method exclusively. - fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES); - fileIS = csFactory.createCompressorInputStream("bzip2", fileIS); - } - // Wrap the stream with a BufferedReader for several reasons: - // 1. We need the readLine() method. - // 2. Even if bzip.compression is enabled, and is wrapped with - // BufferedInputStream, wrapping with a buffer can still improve - // performance, since the BIS buffer will be used to read from the - // compressed stream, while the BR buffer will be used to read from the - // uncompressed stream. 
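// A minimal sketch of the on-the-fly bzip2 decompression the comment above
// describes, using the same Commons Compress calls as the removed openFile();
// the 64KB buffer mirrors READER_BUFFER_BYTES, and the extension check is an
// assumption for illustration.
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.commons.compress.compressors.CompressorStreamFactory;

public class Bzip2LineReader {
  public static BufferedReader open(String fileName) throws Exception {
    InputStream is = new FileInputStream(fileName);
    if (fileName.endsWith("bz2")) {
      // The bzip2 stream reads byte-by-byte, so buffer beneath it; buffering
      // the reader as well lets readLine() work on the uncompressed stream.
      is = new BufferedInputStream(is, 64 * 1024);
      is = new CompressorStreamFactory().createCompressorInputStream("bzip2", is);
    }
    return new BufferedReader(new InputStreamReader(is, "UTF-8"), 64 * 1024);
  }
}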
- fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES); - } catch (IOException e) { - throw new RuntimeException(e); - } catch (CompressorException e) { - throw new RuntimeException(e); - } - } - - public int numUniqueTexts() { - return -1; - } - } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java new file mode 100644 index 00000000000..620525fa11f --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java @@ -0,0 +1,116 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; + +import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; +import org.apache.lucene.benchmark.byTask.utils.Config; + +/** + * A {@link ContentSource} reading one line at a time as a + * {@link org.apache.lucene.document.Document} from a single file. This saves IO + * cost (over DirContentSource) of recursing through a directory and opening a + * new file for every document.
+ * The expected format of each line is (arguments are separated by <TAB>): + * title, date, body. If a line is read in a different format, a + * {@link RuntimeException} will be thrown. In general, you should use this + * content source for files that were created with {@link WriteLineDocTask}.
+ *
+ * Config properties: + *
    + *
+ * <ul> + * <li>docs.file=&lt;path to the file&gt; + * </ul>
+ */ +public class LineDocSource extends ContentSource { + + private final static char SEP = WriteLineDocTask.SEP; + + private File file; + private BufferedReader reader; + + private synchronized void openFile() { + try { + if (reader != null) { + reader.close(); + } + InputStream is = getInputStream(file); + reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public void close() throws IOException { + if (reader != null) { + reader.close(); + reader = null; + } + } + + public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { + String line; + synchronized(this) { + line = reader.readLine(); + if (line == null) { + if (!forever) { + throw new NoMoreDataException(); + } + // Reset the file + openFile(); + return getNextDocData(docData); + } + } + + // A line must be in the following format. If it's not, fail ! + // title date body + int spot = line.indexOf(SEP); + if (spot == -1) { + throw new RuntimeException("line: [" + line + "] is in an invalid format !"); + } + int spot2 = line.indexOf(SEP, 1 + spot); + if (spot2 == -1) { + throw new RuntimeException("line: [" + line + "] is in an invalid format !"); + } + // The date String was written in the format of DateTools.dateToString. + docData.clear(); + docData.setBody(line.substring(1 + spot2, line.length())); + docData.setTitle(line.substring(0, spot)); + docData.setDate(line.substring(1 + spot, spot2)); + return docData; + } + + public void resetInputs() throws IOException { + super.resetInputs(); + openFile(); + } + + public void setConfig(Config config) { + super.setConfig(config); + String fileName = config.get("docs.file", null); + if (fileName == null) { + throw new IllegalArgumentException("docs.file must be set"); + } + file = new File(fileName).getAbsoluteFile(); + } + +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java new file mode 100644 index 00000000000..92d3bc53195 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java @@ -0,0 +1,147 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
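// A hedged demo (not part of the patch) of the TAB-separated line format that
// LineDocSource parses above: title<TAB>date<TAB>body, as written by
// WriteLineDocTask. The sample line and its DateTools-style date value are
// invented for illustration.
public class LineFormatDemo {
  public static void main(String[] args) {
    String line = "Some Title\t20090618120000\tThe body text of the document.";
    int spot = line.indexOf('\t');
    int spot2 = line.indexOf('\t', 1 + spot);
    if (spot == -1 || spot2 == -1) {
      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
    }
    System.out.println("title: " + line.substring(0, spot));
    System.out.println("date:  " + line.substring(1 + spot, spot2)); // DateTools.dateToString output
    System.out.println("body:  " + line.substring(1 + spot2));
  }
}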
+ */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.text.DateFormat; +import java.text.ParsePosition; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.Locale; + +import org.apache.lucene.benchmark.byTask.utils.Config; + +/** + * A {@link ContentSource} reading from the Reuters collection. + *

+ * Config properties: + *

    + *
+ * <ul> + * <li>work.dir - path to the root of docs and indexes dirs (default + * work). + * <li>docs.dir - path to the docs dir (default reuters-out). + * </ul>
+ */ +public class ReutersContentSource extends ContentSource { + + private static final class DateFormatInfo { + DateFormat df; + ParsePosition pos; + } + + private ThreadLocal dateFormat = new ThreadLocal(); + private File dataDir = null; + private ArrayList inputFiles = new ArrayList(); + private int nextFile = 0; + private int iteration = 0; + + public void setConfig(Config config) { + super.setConfig(config); + File workDir = new File(config.get("work.dir", "work")); + String d = config.get("docs.dir", "reuters-out"); + dataDir = new File(d); + if (!dataDir.isAbsolute()) { + dataDir = new File(workDir, d); + } + inputFiles.clear(); + collectFiles(dataDir, inputFiles); + if (inputFiles.size() == 0) { + throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); + } + } + + private synchronized DateFormatInfo getDateFormatInfo() { + DateFormatInfo dfi = (DateFormatInfo) dateFormat.get(); + if (dfi == null) { + dfi = new DateFormatInfo(); + // date format: 30-MAR-1987 14:22:36.87 + dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US); + dfi.df.setLenient(true); + dfi.pos = new ParsePosition(0); + dateFormat.set(dfi); + } + return dfi; + } + + private Date parseDate(String dateStr) { + DateFormatInfo dfi = getDateFormatInfo(); + dfi.pos.setIndex(0); + dfi.pos.setErrorIndex(-1); + return dfi.df.parse(dateStr.trim(), dfi.pos); + } + + + public void close() throws IOException { + // TODO implement? + } + + public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { + File f = null; + String name = null; + synchronized (this) { + if (nextFile >= inputFiles.size()) { + // exhausted files, start a new round, unless forever set to false. + if (!forever) { + throw new NoMoreDataException(); + } + nextFile = 0; + iteration++; + } + f = (File) inputFiles.get(nextFile++); + name = f.getCanonicalPath() + "_" + iteration; + } + + BufferedReader reader = new BufferedReader(new FileReader(f)); + try { + // First line is the date, 3rd is the title, rest is body + String dateStr = reader.readLine(); + reader.readLine();// skip an empty line + String title = reader.readLine(); + reader.readLine();// skip an empty line + StringBuffer bodyBuf = new StringBuffer(1024); + String line = null; + while ((line = reader.readLine()) != null) { + bodyBuf.append(line).append(' '); + } + reader.close(); + + addBytes(f.length()); + + Date date = parseDate(dateStr.trim()); + + docData.clear(); + docData.setName(name); + docData.setBody(bodyBuf.toString()); + docData.setTitle(title); + docData.setDate(date); + return docData; + } finally { + reader.close(); + } + } + + public synchronized void resetInputs() throws IOException { + super.resetInputs(); + nextFile = 0; + iteration = 0; + } + +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java deleted file mode 100644 index 4e13ddd6e8c..00000000000 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java +++ /dev/null @@ -1,135 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
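// A small sketch (illustrative class name, not part of the patch) of the
// synchronized round-robin cursor ReutersContentSource.getNextDocData() uses
// above: on exhaustion it either wraps into a new iteration or stops,
// depending on `forever`. Assumes a non-empty item list.
import java.util.List;

public class RoundRobinCursor {
  private final List items;      // e.g. the collected input File objects
  private final boolean forever;
  private int next = 0;
  private int iteration = 0;

  public RoundRobinCursor(List items, boolean forever) {
    this.items = items;
    this.forever = forever;
  }

  // Returns "<item>_<iteration>" so re-served items get distinct names.
  public synchronized String nextName() {
    if (next >= items.size()) {
      if (!forever) {
        throw new RuntimeException("no more data"); // stands in for NoMoreDataException
      }
      next = 0;
      iteration++;
    }
    return items.get(next++) + "_" + iteration;
  }
}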
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.benchmark.byTask.utils.Config; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.Locale; - - -/** - * A DocMaker using the Reuters collection for its input. - *

- * Config properties:

    - *
- * <ul> - * <li>work.dir=&lt;path to the root of docs and indexes dirs| Default: work&gt;</li> - * <li>docs.dir=&lt;path to the docs dir| Default: reuters-out&gt;</li> - * </ul>
- */ -public class ReutersDocMaker extends BasicDocMaker { - - private ThreadLocal dateFormat = new ThreadLocal(); - private File dataDir = null; - private ArrayList inputFiles = new ArrayList(); - private int nextFile = 0; - private int iteration=0; - - /* (non-Javadoc) - * @see SimpleDocMaker#setConfig(java.util.Properties) - */ - public void setConfig(Config config) { - super.setConfig(config); - File workDir = new File(config.get("work.dir","work")); - String d = config.get("docs.dir","reuters-out"); - dataDir = new File(d); - if (!dataDir.isAbsolute()) { - dataDir = new File(workDir, d); - } - resetUniqueBytes(); - inputFiles.clear(); - collectFiles(dataDir,inputFiles); - if (inputFiles.size()==0) { - throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); - } - } - - // get/initiate a thread-local simple date format (must do so - // because SimpleDateFormat is not thread-safe. - protected synchronized DateFormat getDateFormat () { - DateFormat df = (DateFormat) dateFormat.get(); - if (df == null) { - // date format: 30-MAR-1987 14:22:36.87 - df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US); - df.setLenient(true); - dateFormat.set(df); - } - return df; - } - - protected DocData getNextDocData() throws Exception { - File f = null; - String name = null; - synchronized (this) { - if (nextFile >= inputFiles.size()) { - // exhausted files, start a new round, unless forever set to false. - if (!forever) { - throw new NoMoreDataException(); - } - nextFile = 0; - iteration++; - } - f = (File) inputFiles.get(nextFile++); - name = f.getCanonicalPath()+"_"+iteration; - } - - BufferedReader reader = new BufferedReader(new FileReader(f)); - String line = null; - //First line is the date, 3rd is the title, rest is body - String dateStr = reader.readLine(); - reader.readLine();//skip an empty line - String title = reader.readLine(); - reader.readLine();//skip an empty line - StringBuffer bodyBuf = new StringBuffer(1024); - while ((line = reader.readLine()) != null) { - bodyBuf.append(line).append(' '); - } - reader.close(); - - addBytes(f.length()); - - - Date date = getDateFormat().parse(dateStr.trim()); - return new DocData(name, bodyBuf.toString(), title, null, date); - } - - - /* - * (non-Javadoc) - * @see DocMaker#resetIinputs() - */ - public synchronized void resetInputs() { - super.resetInputs(); - nextFile = 0; - iteration = 0; - } - - /* - * (non-Javadoc) - * @see DocMaker#numUniqueTexts() - */ - public int numUniqueTexts() { - return inputFiles.size(); - } - -} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java index 0b0081b4e44..9591ebb6b86 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java @@ -71,7 +71,7 @@ public class ReutersQueryMaker extends AbstractQueryMaker implements QueryMaker * @return array of Lucene queries */ private static Query[] createQueries(List qs, Analyzer a) { - QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD, a); + QueryParser qp = new QueryParser(DocMaker.BODY_FIELD, a); List queries = new ArrayList(); for (int i = 0; i < qs.size(); i++) { try { @@ -107,7 +107,7 @@ public class ReutersQueryMaker extends AbstractQueryMaker implements QueryMaker List queryList = new ArrayList(20); 
queryList.addAll(Arrays.asList(STANDARD_QUERIES)); - queryList.addAll(Arrays.asList(getPrebuiltQueries(BasicDocMaker.BODY_FIELD))); + queryList.addAll(Arrays.asList(getPrebuiltQueries(DocMaker.BODY_FIELD))); return createQueries(queryList, anlzr); } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java index 638fbd01438..d440a712514 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java @@ -29,7 +29,7 @@ import java.util.ArrayList; /** * A QueryMaker that makes queries for a collection created - * using {@link org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker}. + * using {@link org.apache.lucene.benchmark.byTask.feeds.SingleDocSource}. */ public class SimpleQueryMaker extends AbstractQueryMaker implements QueryMaker { @@ -45,11 +45,11 @@ public class SimpleQueryMaker extends AbstractQueryMaker implements QueryMaker { Analyzer anlzr= (Analyzer) Class.forName(config.get("analyzer", "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance(); - QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD,anlzr); + QueryParser qp = new QueryParser(DocMaker.BODY_FIELD,anlzr); ArrayList qq = new ArrayList(); - Query q1 = new TermQuery(new Term(BasicDocMaker.ID_FIELD,"doc2")); + Query q1 = new TermQuery(new Term(DocMaker.ID_FIELD,"doc2")); qq.add(q1); - Query q2 = new TermQuery(new Term(BasicDocMaker.BODY_FIELD,"simple")); + Query q2 = new TermQuery(new Term(DocMaker.BODY_FIELD,"simple")); qq.add(q2); BooleanQuery bq = new BooleanQuery(); bq.add(q1,Occur.MUST); diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleSloppyPhraseQueryMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleSloppyPhraseQueryMaker.java index f16f49e82a6..73feda962b5 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleSloppyPhraseQueryMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleSloppyPhraseQueryMaker.java @@ -36,7 +36,7 @@ public class SimpleSloppyPhraseQueryMaker extends SimpleQueryMaker { // exatract some 100 words from doc text to an array String words[]; ArrayList w = new ArrayList(); - StringTokenizer st = new StringTokenizer(SimpleDocMaker.DOC_TEXT); + StringTokenizer st = new StringTokenizer(SingleDocSource.DOC_TEXT); while (st.hasMoreTokens() && w.size()<100) { w.add(st.nextToken()); } @@ -53,7 +53,7 @@ public class SimpleSloppyPhraseQueryMaker extends SimpleQueryMaker { q.setSlop(slop); int wind = wd; for (int i=0; i0) { remainedSlop--; wind++; @@ -66,7 +66,7 @@ public class SimpleSloppyPhraseQueryMaker extends SimpleQueryMaker { q.setSlop(slop+2*qlen); wind = wd+qlen+remainedSlop-1; for (int i=0; i0) { remainedSlop--; wind--; diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SingleDocSource.java similarity index 81% rename from contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java rename to contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SingleDocSource.java index 8fe74e7e2d6..78189ccea86 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java +++ 
b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SingleDocSource.java @@ -1,5 +1,7 @@ package org.apache.lucene.benchmark.byTask.feeds; +import java.io.IOException; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -18,9 +20,9 @@ package org.apache.lucene.benchmark.byTask.feeds; */ /** - * Create documents for the test. + * Creates the same document each time {@link #getNextDocData()} is called. */ -public class SimpleDocMaker extends BasicDocMaker { +public class SingleDocSource extends ContentSource { private int docID = 0; @@ -42,33 +44,26 @@ public class SimpleDocMaker extends BasicDocMaker { // return a new docid private synchronized int newdocid() throws NoMoreDataException { - if (docID>0 && !forever) { + if (docID > 0 && !forever) { throw new NoMoreDataException(); } return docID++; } - /* - * (non-Javadoc) - * @see DocMaker#resetIinputs() - */ - public synchronized void resetInputs() { + public void close() throws IOException {} + + public DocData getNextDocData(DocData docData) throws NoMoreDataException { + int id = newdocid(); + addBytes(DOC_TEXT.length()); + docData.clear(); + docData.setName("doc" + id); + docData.setBody(DOC_TEXT); + return docData; + } + + public synchronized void resetInputs() throws IOException { super.resetInputs(); docID = 0; } - /* - * (non-Javadoc) - * @see DocMaker#numUniqueTexts() - */ - public int numUniqueTexts() { - return 0; // not applicable - } - - protected DocData getNextDocData() throws NoMoreDataException { - int id = newdocid(); - addBytes(DOC_TEXT.length()); - return new DocData("doc"+id, DOC_TEXT, null, null, null); - } - } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSimpleDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSimpleDocMaker.java deleted file mode 100644 index f0442b4d800..00000000000 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SortableSimpleDocMaker.java +++ /dev/null @@ -1,56 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds; - -import java.util.Properties; -import java.util.Random; - -import org.apache.lucene.benchmark.byTask.utils.Config; - -/** - * Adds fields appropriate for sorting: country, - * random_string and sort_field (int). 
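A minimal usage sketch of the new ContentSource contract (illustrative only, not part of this patch): getNextDocData(DocData) fills and returns the instance it is handed, so callers can loop without allocating one DocData per document. The content.source.forever key and the DocData getter are assumptions here, inferred from the setters and test configurations elsewhere in this patch.

    import java.util.Properties;
    import org.apache.lucene.benchmark.byTask.feeds.DocData;
    import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
    import org.apache.lucene.benchmark.byTask.feeds.SingleDocSource;
    import org.apache.lucene.benchmark.byTask.utils.Config;

    public class SingleDocSourceDemo {
      public static void main(String[] args) throws Exception {
        Properties p = new Properties();
        p.setProperty("content.source.forever", "false"); // stop after one pass
        SingleDocSource source = new SingleDocSource();
        source.setConfig(new Config(p));
        DocData dd = new DocData();
        try {
          while (true) {
            dd = source.getNextDocData(dd);   // refills the same DocData instance
            System.out.println(dd.getName()); // "doc0" first; getName() assumed to mirror setName()
          }
        } catch (NoMoreDataException e) {
          // the single document has been handed out and forever is off
        } finally {
          source.close();
        }
      }
    }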
- * - */ -public class SortableSimpleDocMaker extends SimpleDocMaker { - private int sortRange; - - private static String[] COUNTRIES = new String[] {"European Union", "United States", "Japan", "Germany", "China (PRC)", "United Kingdom", "France", "Italy", "Spain", "Canada", "Brazil", "Russia", "India", "South Korea", "Australia", "Mexico", "Netherlands", "Turkey", "Sweden", "Belgium", "Indonesia", "Switzerland", "Poland", "Norway", "Republic of China", "Saudi Arabia", "Austria", "Greece", "Denmark", "Iran", "South Africa", "Argentina", "Ireland", "Thailand", "Finland", "Venezuela", "Portugal", "Hong Kong", "United Arab Emirates", "Malaysia", "Czech Republic", "Colombia", "Nigeria", "Romania", "Chile", "Israel", "Singapore", "Philippines", "Pakistan", "Ukraine", "Hungary", "Algeria", "New Zealand", "Egypt", "Kuwait", "Peru", "Kazakhstan", "Slovakia", "Morocco", "Bangladesh", "Vietnam", "Qatar", "Angola", "Libya", "Iraq", "Croatia", "Luxembourg", "Sudan", "Slovenia", "Cuba", "Belarus", "Ecuador", "Serbia", "Oman", "Bulgaria", "Lithuania", "Syria", "Dominican Republic", "Tunisia", "Guatemala", "Azerbaijan", "Sri Lanka", "Kenya", "Latvia", "Turkmenistan", "Costa Rica", "Lebanon", "Uruguay", "Uzbekistan", "Yemen", "Cyprus", "Estonia", "Trinidad and Tobago", "Cameroon", "El Salvador", "Iceland", "Panama", "Bahrain", "Ivory Coast", "Ethiopia", "Tanzania", "Jordan", "Ghana", "Bosnia and Herzegovina", "Macau", "Burma", "Bolivia", "Brunei", "Botswana", "Honduras", "Gabon", "Uganda", "Jamaica", "Zambia", "Senegal", "Paraguay", "Albania", "Equatorial Guinea", "Georgia", "Democratic Republic of the Congo", "Nepal", "Afghanistan", "Cambodia", "Armenia", "Republic of the Congo", "Mozambique", "Republic of Macedonia", "Malta", "Namibia", "Madagascar", "Chad", "Burkina Faso", "Mauritius", "Mali", "The Bahamas", "Papua New Guinea", "Nicaragua", "Haiti", "Benin", "alestinian flag West Bank and Gaza", "Jersey", "Fiji", "Guinea", "Moldova", "Niger", "Laos", "Mongolia", "French Polynesia", "Kyrgyzstan", "Barbados", "Tajikistan", "Malawi", "Liechtenstein", "New Caledonia", "Kosovo", "Rwanda", "Montenegro", "Swaziland", "Guam", "Mauritania", "Guernsey", "Isle of Man", "Togo", "Somalia", "Suriname", "Aruba", "North Korea", "Zimbabwe", "Central African Republic", "Faroe Islands", "Greenland", "Sierra Leone", "Lesotho", "Cape Verde", "Eritrea", "Bhutan", "Belize", "Antigua and Barbuda", "Gibraltar", "Maldives", "San Marino", "Guyana", "Burundi", "Saint Lucia", "Djibouti", "British Virgin Islands", "Liberia", "Seychelles", "The Gambia", "Northern Mariana Islands", "Grenada", "Saint Vincent and the Grenadines", "Saint Kitts and Nevis", "East Timor", "Vanuatu", "Comoros", "Samoa", "Solomon Islands", "Guinea-Bissau", "American Samoa", "Dominica", "Micronesia", "Tonga", "Cook Islands", "Palau", "Marshall Islands", "S�o Tom� and Pr�ncipe", "Anguilla", "Kiribati", "Tuvalu", "Niue"}; - - protected DocData getNextDocData() throws NoMoreDataException { - Random r = new Random(); - DocData doc = super.getNextDocData(); - Properties props = new Properties(); - - // random int - props.put("sort_field", Integer.toString(nextInt(r, sortRange))); - - // random string - int len = nextInt(r, 2, 20); - char[] buffer = new char[len]; - for(int i=0;i + *
  • sort.rng - defines the range for sort-by-int field (default + * 20000). + *
• rand.seed - defines the seed to initialize Random with (default
+ * 13).
+ *
+ */
+public class SortableSingleDocSource extends SingleDocSource {
+
+  private static String[] COUNTRIES = new String[] {
+    "European Union", "United States", "Japan", "Germany", "China (PRC)",
+    "United Kingdom", "France", "Italy", "Spain", "Canada", "Brazil", "Russia",
+    "India", "South Korea", "Australia", "Mexico", "Netherlands", "Turkey",
+    "Sweden", "Belgium", "Indonesia", "Switzerland", "Poland", "Norway",
+    "Republic of China", "Saudi Arabia", "Austria", "Greece", "Denmark", "Iran",
+    "South Africa", "Argentina", "Ireland", "Thailand", "Finland", "Venezuela",
+    "Portugal", "Hong Kong", "United Arab Emirates", "Malaysia",
+    "Czech Republic", "Colombia", "Nigeria", "Romania", "Chile", "Israel",
+    "Singapore", "Philippines", "Pakistan", "Ukraine", "Hungary", "Algeria",
+    "New Zealand", "Egypt", "Kuwait", "Peru", "Kazakhstan", "Slovakia",
+    "Morocco", "Bangladesh", "Vietnam", "Qatar", "Angola", "Libya", "Iraq",
+    "Croatia", "Luxembourg", "Sudan", "Slovenia", "Cuba", "Belarus", "Ecuador",
+    "Serbia", "Oman", "Bulgaria", "Lithuania", "Syria", "Dominican Republic",
+    "Tunisia", "Guatemala", "Azerbaijan", "Sri Lanka", "Kenya", "Latvia",
+    "Turkmenistan", "Costa Rica", "Lebanon", "Uruguay", "Uzbekistan", "Yemen",
+    "Cyprus", "Estonia", "Trinidad and Tobago", "Cameroon", "El Salvador",
+    "Iceland", "Panama", "Bahrain", "Ivory Coast", "Ethiopia", "Tanzania",
+    "Jordan", "Ghana", "Bosnia and Herzegovina", "Macau", "Burma", "Bolivia",
+    "Brunei", "Botswana", "Honduras", "Gabon", "Uganda", "Jamaica", "Zambia",
+    "Senegal", "Paraguay", "Albania", "Equatorial Guinea", "Georgia",
+    "Democratic Republic of the Congo", "Nepal", "Afghanistan", "Cambodia",
+    "Armenia", "Republic of the Congo", "Mozambique", "Republic of Macedonia",
+    "Malta", "Namibia", "Madagascar", "Chad", "Burkina Faso", "Mauritius",
+    "Mali", "The Bahamas", "Papua New Guinea", "Nicaragua", "Haiti", "Benin",
+    "Palestinian West Bank and Gaza", "Jersey", "Fiji", "Guinea", "Moldova",
+    "Niger", "Laos", "Mongolia", "French Polynesia", "Kyrgyzstan", "Barbados",
+    "Tajikistan", "Malawi", "Liechtenstein", "New Caledonia", "Kosovo",
+    "Rwanda", "Montenegro", "Swaziland", "Guam", "Mauritania", "Guernsey",
+    "Isle of Man", "Togo", "Somalia", "Suriname", "Aruba", "North Korea",
+    "Zimbabwe", "Central African Republic", "Faroe Islands", "Greenland",
+    "Sierra Leone", "Lesotho", "Cape Verde", "Eritrea", "Bhutan", "Belize",
+    "Antigua and Barbuda", "Gibraltar", "Maldives", "San Marino", "Guyana",
+    "Burundi", "Saint Lucia", "Djibouti", "British Virgin Islands", "Liberia",
+    "Seychelles", "The Gambia", "Northern Mariana Islands", "Grenada",
+    "Saint Vincent and the Grenadines", "Saint Kitts and Nevis", "East Timor",
+    "Vanuatu", "Comoros", "Samoa", "Solomon Islands", "Guinea-Bissau",
+    "American Samoa", "Dominica", "Micronesia", "Tonga", "Cook Islands",
+    "Palau", "Marshall Islands", "São Tomé and Príncipe", "Anguilla",
+    "Kiribati", "Tuvalu", "Niue" };
+
+  private int sortRange;
+  private Random r;
+
+  public DocData getNextDocData(DocData docData) throws NoMoreDataException {
+    docData = super.getNextDocData(docData);
+    Properties props = new Properties();
+
+    // random int
+    props.put("sort_field", Integer.toString(r.nextInt(sortRange)));
+
+    // random string
+    int len = nextInt(2, 20);
+    char[] buffer = new char[len];
+    for (int i = 0; i < len; i++) {
+      buffer[i] = (char) r.nextInt(0x80);
+    }
+    props.put("random_string", new String(buffer));
+
+    // random
country + props.put("country", COUNTRIES[r.nextInt(COUNTRIES.length)]); + docData.setProps(props); + return docData; + } + + private int nextInt(int start, int end) { + return start + r.nextInt(end - start); + } + + public void setConfig(Config config) { + super.setConfig(config); + sortRange = config.get("sort.rng", 20000); + r = new Random(config.get("rand.seed", 13)); + } + +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java new file mode 100644 index 00000000000..64ff751720a --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java @@ -0,0 +1,339 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.text.DateFormat; +import java.text.ParsePosition; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.Locale; +import java.util.zip.GZIPInputStream; + +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.benchmark.byTask.utils.StringBufferReader; + +/** + * Implements a {@link ContentSource} over the TREC collection. + *
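For the SortableSingleDocSource just added, a minimal configuration sketch (illustrative only, not part of this patch; the property keys are the two documented in its javadoc, and setConfig must run before getNextDocData because it seeds the Random):

    import java.util.Properties;
    import org.apache.lucene.benchmark.byTask.feeds.DocData;
    import org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource;
    import org.apache.lucene.benchmark.byTask.utils.Config;

    public class SortableSourceDemo {
      public static void main(String[] args) throws Exception {
        Properties p = new Properties();
        p.setProperty("sort.rng", "10000"); // sort_field drawn from [0, 10000)
        p.setProperty("rand.seed", "42");   // fixed seed, reproducible values
        SortableSingleDocSource source = new SortableSingleDocSource();
        source.setConfig(new Config(p));
        DocData dd = source.getNextDocData(new DocData());
        // dd's Properties now carry sort_field, random_string and country
      }
    }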

    + * Supports the following configuration parameters (on top of + * {@link ContentSource}): + *

      + *
    • work.dir - specifies the working directory. Required if "docs.dir" + * denotes a relative path (default=work). + *
    • docs.dir - specifies the directory where the TREC files reside. + * Can be set to a relative path if "work.dir" is also specified + * (default=trec). + *
    • html.parser - specifies the {@link HTMLParser} class to use for + * parsing the TREC documents content (default=DemoHTMLParser). + *
+ */
+public class TrecContentSource extends ContentSource {
+  // TODO (3.0): change StringBuffer to StringBuilder
+
+  private static final class DateFormatInfo {
+    DateFormat[] dfs;
+    ParsePosition pos;
+  }
+
+  private static final String DATE = "Date: ";
+  private static final String DOCHDR = "<DOCHDR>";
+  private static final String TERMINATING_DOCHDR = "</DOCHDR>";
+  private static final String DOCNO = "<DOCNO>";
+  private static final String TERMINATING_DOCNO = "</DOCNO>";
+  private static final String DOC = "<DOC>";
+  private static final String TERMINATING_DOC = "</DOC>";
+
+  private static final String NEW_LINE = System.getProperty("line.separator");
+
+  private static final String DATE_FORMATS [] = {
+    "EEE, dd MMM yyyy kk:mm:ss z",   // Tue, 09 Dec 2003 22:39:08 GMT
+    "EEE MMM dd kk:mm:ss yyyy z",    // Tue Dec 09 16:45:08 2003 EST
+    "EEE, dd-MMM-':'y kk:mm:ss z",   // Tue, 09 Dec 2003 22:39:08 GMT
+    "EEE, dd-MMM-yyy kk:mm:ss z",    // Tue, 09 Dec 2003 22:39:08 GMT
+    "EEE MMM dd kk:mm:ss yyyy",      // Tue Dec 09 16:45:08 2003
+  };
+
+  private ThreadLocal dateFormats = new ThreadLocal();
+  private ThreadLocal trecDocReader = new ThreadLocal();
+  private ThreadLocal trecDocBuffer = new ThreadLocal();
+  private File dataDir = null;
+  private ArrayList inputFiles = new ArrayList();
+  private int nextFile = 0;
+  private int rawDocSize;
+
+  // Used to synchronize threads on reading from the TREC documents.
+  private Object lock = new Object();
+
+  // Required for test
+  BufferedReader reader;
+  int iteration = 0;
+  HTMLParser htmlParser;
+
+  private DateFormatInfo getDateFormatInfo() {
+    DateFormatInfo dfi = (DateFormatInfo) dateFormats.get();
+    if (dfi == null) {
+      dfi = new DateFormatInfo();
+      dfi.dfs = new SimpleDateFormat[DATE_FORMATS.length];
+      for (int i = 0; i < dfi.dfs.length; i++) {
+        dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.US);
+        dfi.dfs[i].setLenient(true);
+      }
+      dfi.pos = new ParsePosition(0);
+      dateFormats.set(dfi);
+    }
+    return dfi;
+  }
+
+  private StringBuffer getDocBuffer() {
+    StringBuffer sb = (StringBuffer) trecDocBuffer.get();
+    if (sb == null) {
+      sb = new StringBuffer();
+      trecDocBuffer.set(sb);
+    }
+    return sb;
+  }
+
+  private Reader getTrecDocReader(StringBuffer docBuffer) {
+    StringBufferReader r = (StringBufferReader) trecDocReader.get();
+    if (r == null) {
+      r = new StringBufferReader(docBuffer);
+      trecDocReader.set(r);
+    } else {
+      r.set(docBuffer);
+    }
+    return r;
+  }
+
+  // read until finding a line that starts with the specified prefix, or a terminating tag has been found.
+  private void read(StringBuffer buf, String prefix, boolean collectMatchLine,
+      boolean collectAll, String terminatingTag)
+      throws IOException, NoMoreDataException {
+    String sep = "";
+    while (true) {
+      String line = reader.readLine();
+
+      if (line == null) {
+        openNextFile();
+        continue;
+      }
+
+      rawDocSize += line.length();
+
+      if (line.startsWith(prefix)) {
+        if (collectMatchLine) {
+          buf.append(sep).append(line);
+          sep = NEW_LINE;
+        }
+        break;
+      }
+
+      if (terminatingTag != null && line.startsWith(terminatingTag)) {
+        // didn't find the prefix that was asked, but the terminating
+        // tag was found. set the length to 0 to signal no match was
+        // found.
+        buf.setLength(0);
+        break;
+      }
+
+      if (collectAll) {
+        buf.append(sep).append(line);
+        sep = NEW_LINE;
+      }
+    }
+  }
+
+  void openNextFile() throws NoMoreDataException, IOException {
+    close();
+    int retries = 0;
+    while (true) {
+      if (nextFile >= inputFiles.size()) {
+        // exhausted files, start a new round, unless forever set to false.
+ if (!forever) { + throw new NoMoreDataException(); + } + nextFile = 0; + iteration++; + } + File f = (File) inputFiles.get(nextFile++); + if (verbose) { + System.out.println("opening: " + f + " length: " + f.length()); + } + try { + GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), 1 << 16); + reader = new BufferedReader(new InputStreamReader(zis), 1 << 16); + return; + } catch (Exception e) { + retries++; + if (retries < 20 && verbose) { + System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + " #retries=" + retries); + continue; + } + throw new NoMoreDataException(); + } + } + } + + Date parseDate(String dateStr) { + dateStr = dateStr.trim(); + DateFormatInfo dfi = getDateFormatInfo(); + for (int i = 0; i < dfi.dfs.length; i++) { + DateFormat df = dfi.dfs[i]; + dfi.pos.setIndex(0); + dfi.pos.setErrorIndex(-1); + Date d = df.parse(dateStr, dfi.pos); + if (d != null) { + // Parse succeeded. + return d; + } + } + // do not fail test just because a date could not be parsed + if (verbose) { + System.out.println("failed to parse date (assigning 'now') for: " + dateStr); + } + return null; + } + + public void close() throws IOException { + if (reader == null) { + return; + } + + try { + reader.close(); + } catch (IOException e) { + if (verbose) { + System.out.println("failed to close reader !"); + e.printStackTrace(System.out); + } + } + reader = null; + } + + public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { + String dateStr = null, name = null; + Reader r = null; + // protect reading from the TREC files by multiple threads. The rest of the + // method, i.e., parsing the content and returning the DocData can run + // unprotected. + synchronized (lock) { + if (reader == null) { + openNextFile(); + } + + StringBuffer docBuf = getDocBuffer(); + + // 1. skip until doc start + docBuf.setLength(0); + read(docBuf, DOC, false, false, null); + + // 2. name + docBuf.setLength(0); + read(docBuf, DOCNO, true, false, null); + name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO, + DOCNO.length())); + name = name + "_" + iteration; + + // 3. skip until doc header + docBuf.setLength(0); + read(docBuf, DOCHDR, false, false, null); + + boolean findTerminatingDocHdr = false; + + // 4. date - look for the date only until /DOCHDR + docBuf.setLength(0); + read(docBuf, DATE, true, false, TERMINATING_DOCHDR); + if (docBuf.length() != 0) { + // Date found. + dateStr = docBuf.substring(DATE.length()); + findTerminatingDocHdr = true; + } + + // 5. skip until end of doc header + if (findTerminatingDocHdr) { + docBuf.setLength(0); + read(docBuf, TERMINATING_DOCHDR, false, false, null); + } + + // 6. collect until end of doc + docBuf.setLength(0); + read(docBuf, TERMINATING_DOC, false, true, null); + + // 7. Set up a Reader over the read content + r = getTrecDocReader(docBuf); + // Resetting the thread's reader means it will reuse the instance + // allocated as well as re-read from docBuf. + r.reset(); + + // count char length of parsed html text (larger than the plain doc body text). + addBytes(docBuf.length()); + } + + // This code segment relies on HtmlParser being thread safe. When we get + // here, everything else is already private to that thread, so we're safe. + Date date = dateStr != null ? 
parseDate(dateStr) : null; + try { + docData = htmlParser.parse(docData, name, date, r, null); + addDoc(); + } catch (InterruptedException e) { + IOException ex = new IOException(e.getMessage()); + ex.initCause(e); + throw ex; + } + + return docData; + } + + public void resetInputs() throws IOException { + synchronized (lock) { + super.resetInputs(); + close(); + nextFile = 0; + iteration = 0; + } + } + + public void setConfig(Config config) { + super.setConfig(config); + File workDir = new File(config.get("work.dir", "work")); + String d = config.get("docs.dir", "trec"); + dataDir = new File(d); + if (!dataDir.isAbsolute()) { + dataDir = new File(workDir, d); + } + collectFiles(dataDir, inputFiles); + if (inputFiles.size() == 0) { + throw new IllegalArgumentException("No files in dataDir: " + dataDir); + } + try { + String parserClassName = config.get("html.parser", + "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser"); + htmlParser = (HTMLParser) Class.forName(parserClassName).newInstance(); + } catch (Exception e) { + // Should not get here. Throw runtime exception. + throw new RuntimeException(e); + } + } + +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java deleted file mode 100644 index 38494c163ef..00000000000 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java +++ /dev/null @@ -1,262 +0,0 @@ -package org.apache.lucene.benchmark.byTask.feeds; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.BufferedInputStream; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.text.DateFormat; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.Locale; -import java.util.zip.GZIPInputStream; - -import org.apache.lucene.benchmark.byTask.utils.Config; - -/** - * A DocMaker using the (compressed) Trec collection for its input. - *
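A usage sketch for the new TrecContentSource (illustrative only, not part of this patch; it assumes work/trec already holds the gzipped TREC files and relies only on the API shown above):

    import java.util.Properties;
    import org.apache.lucene.benchmark.byTask.feeds.DocData;
    import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
    import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource;
    import org.apache.lucene.benchmark.byTask.utils.Config;

    public class TrecReadDemo {
      public static void main(String[] args) throws Exception {
        Properties p = new Properties();
        p.setProperty("work.dir", "work");
        p.setProperty("docs.dir", "trec");                // *.gz files, relative to work.dir
        p.setProperty("content.source.forever", "false"); // one pass over the collection
        TrecContentSource source = new TrecContentSource();
        source.setConfig(new Config(p)); // throws IllegalArgumentException if the dir is empty
        source.resetInputs();
        DocData dd = new DocData();
        try {
          while (true) {
            dd = source.getNextDocData(dd); // parses one <DOC>...</DOC> block
          }
        } catch (NoMoreDataException e) {
          // collection exhausted
        } finally {
          source.close();
        }
      }
    }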

    - * Config properties:

      - *
    • work.dir=<path to the root of docs and indexes dirs| Default: work>
    • - *
    • docs.dir=<path to the docs dir| Default: trec>
    • - *
    - */ -public class TrecDocMaker extends BasicDocMaker { - - private static final String DATE = "Date: "; - private static final String DOCHDR = ""; - private static final String TERM_DOCHDR = ""; - private static final String TERM_DOCNO = ""; - private static final String DOCNO = ""; - private static final String TERM_DOC = ""; - private static final String DOC = ""; - private static final String NEW_LINE = System.getProperty("line.separator"); - - protected ThreadLocal dateFormat = new ThreadLocal(); - protected File dataDir = null; - protected ArrayList inputFiles = new ArrayList(); - protected int nextFile = 0; - protected int iteration=0; - protected BufferedReader reader; - private GZIPInputStream zis; - - private static final String DATE_FORMATS [] = { - "EEE, dd MMM yyyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT - "EEE MMM dd kk:mm:ss yyyy z", //Tue Dec 09 16:45:08 2003 EST - "EEE, dd-MMM-':'y kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT - "EEE, dd-MMM-yyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT - }; - - /* (non-Javadoc) - * @see SimpleDocMaker#setConfig(java.util.Properties) - */ - public void setConfig(Config config) { - super.setConfig(config); - File workDir = new File(config.get("work.dir","work")); - String d = config.get("docs.dir","trec"); - dataDir = new File(d); - if (!dataDir.isAbsolute()) { - dataDir = new File(workDir, d); - } - resetUniqueBytes(); - inputFiles.clear(); - collectFiles(dataDir,inputFiles); - if (inputFiles.size()==0) { - throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); - } - } - - protected void openNextFile() throws NoMoreDataException, Exception { - closeInputs(); - int retries = 0; - while (true) { - File f = null; - synchronized (this) { - if (nextFile >= inputFiles.size()) { - // exhausted files, start a new round, unless forever set to false. - if (!forever) { - throw new NoMoreDataException(); - } - nextFile = 0; - iteration++; - } - f = (File) inputFiles.get(nextFile++); - } - System.out.println("opening: "+f+" length: "+f.length()); - try { - zis = new GZIPInputStream(new BufferedInputStream(new FileInputStream(f))); - reader = new BufferedReader(new InputStreamReader(zis)); - return; - } catch (Exception e) { - retries++; - if (retries<20) { - System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries); - continue; - } else { - throw new NoMoreDataException(); - } - } - } - } - - protected void closeInputs() { - if (zis!=null) { - try { - zis.close(); - } catch (IOException e) { - System.out.println("closeInputs(): Ingnoring error: "+e); - e.printStackTrace(); - } - zis = null; - } - if (reader!=null) { - try { - reader.close(); - } catch (IOException e) { - System.out.println("closeInputs(): Ingnoring error: "+e); - e.printStackTrace(); - } - reader = null; - } - } - - // read until finding a line that starts with the specified prefix - protected StringBuffer read(String prefix, StringBuffer sb, - boolean collectMatchLine, boolean collectAll, - String terminatingTag) throws Exception { - sb = (sb==null ? new StringBuffer() : sb); - String sep = ""; - while (true) { - String line = reader.readLine(); - if (line == null) { - openNextFile(); - continue; - } - if (line.startsWith(prefix)) { - if (collectMatchLine) { - sb.append(sep).append(line); - sep = NEW_LINE; - } - break; - } - - if (terminatingTag != null && line.startsWith(terminatingTag)) { - // didn't find the prefix that was asked, but the terminating - // tag was found. 
set the length to 0 to signal no match was - // found. - sb.setLength(0); - break; - } - - - if (collectAll) { - sb.append(sep).append(line); - sep = NEW_LINE; - } - } - //System.out.println("read: "+sb); - return sb; - } - - protected synchronized DocData getNextDocData() throws NoMoreDataException, Exception { - if (reader==null) { - openNextFile(); - } - // 1. skip until doc start - read(DOC,null,false,false,null); - // 2. name - StringBuffer sb = read(DOCNO,null,true,false,null); - String name = sb.substring(DOCNO.length(), sb.indexOf(TERM_DOCNO, DOCNO.length())); - name = name + "_" + iteration; - // 3. skip until doc header - read(DOCHDR,null,false,false,null); - boolean findTerminatingDocHdr = false; - // 4. date - sb = read(DATE,null,true,false,TERM_DOCHDR); - String dateStr = null; - if (sb.length() != 0) { - // Date found. - dateStr = sb.substring(DATE.length()); - findTerminatingDocHdr = true; - } - - // 5. skip until end of doc header - if (findTerminatingDocHdr) { - read(TERM_DOCHDR,null,false,false,null); - } - // 6. collect until end of doc - sb = read(TERM_DOC,null,false,true,null); - // this is the next document, so parse it - Date date = dateStr != null ? parseDate(dateStr) : null; - HTMLParser p = getHtmlParser(); - DocData docData = p.parse(name, date, sb, getDateFormat(0)); - addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text). - - return docData; - } - - protected DateFormat getDateFormat(int n) { - DateFormat df[] = (DateFormat[]) dateFormat.get(); - if (df == null) { - df = new SimpleDateFormat[DATE_FORMATS.length]; - for (int i = 0; i < df.length; i++) { - df[i] = new SimpleDateFormat(DATE_FORMATS[i],Locale.US); - df[i].setLenient(true); - } - dateFormat.set(df); - } - return df[n]; - } - - protected Date parseDate(String dateStr) { - for (int i = 0; i < DATE_FORMATS.length; i++) { - try { - return getDateFormat(i).parse(dateStr.trim()); - } catch (ParseException e) {} - } - // do not fail test just because a date could not be parsed - System.out.println("ignoring date parse exception (assigning 'null') for: "+dateStr); - return null; - } - - - /* - * (non-Javadoc) - * @see DocMaker#resetIinputs() - */ - public synchronized void resetInputs() { - super.resetInputs(); - closeInputs(); - nextFile = 0; - iteration = 0; - } - - /* - * (non-Javadoc) - * @see DocMaker#numUniqueTexts() - */ - public int numUniqueTexts() { - return inputFiles.size(); - } - -} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java index 482ef5b1de2..dbf99b65b90 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java @@ -80,9 +80,8 @@ public class Sample { Properties p = new Properties(); p.setProperty ( "task.max.depth.log" , "3" ); p.setProperty ( "max.buffered" , "buf:10:10:100:100:10:10:100:100" ); - p.setProperty ( "doc.maker" , "org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker" ); - p.setProperty ( "doc.add.log.step" , "2000" ); - p.setProperty ( "doc.delete.log.step" , "2000" ); + p.setProperty ( "doc.maker" , "org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource" ); + p.setProperty ( "log.step" , "2000" ); p.setProperty ( "doc.delete.step" , "8" ); p.setProperty ( "analyzer" , "org.apache.lucene.analysis.standard.StandardAnalyzer" ); p.setProperty ( 
"doc.term.vector" , "false" ); diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java index 5e1504678b6..433e663506f 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java @@ -20,38 +20,23 @@ package org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.document.Document; -import java.text.NumberFormat; - /** * Add a document, optionally with of a certain size. *
    Other side effects: none. - *
    Relevant properties: doc.add.log.step. *
    Takes optional param: document size. */ public class AddDocTask extends PerfTask { - /** - * Default value for property doc.add.log.step - indicating how often - * an "added N docs" message should be logged. - */ - public static final int DEFAULT_ADD_DOC_LOG_STEP = 500; - public AddDocTask(PerfRunData runData) { super(runData); } - private int logStep = -1; private int docSize = 0; - int count = 0; // volatile data passed between setup(), doLogic(), tearDown(). private Document doc = null; - /* - * (non-Javadoc) - * @see PerfTask#setup() - */ public void setup() throws Exception { super.setup(); DocMaker docMaker = getRunData().getDocMaker(); @@ -62,33 +47,20 @@ public class AddDocTask extends PerfTask { } } - /* (non-Javadoc) - * @see PerfTask#tearDown() - */ public void tearDown() throws Exception { - log(++count); doc = null; super.tearDown(); } + protected String getLogMessage(int recsCount) { + return "added " + recsCount + " docs"; + } + public int doLogic() throws Exception { getRunData().getIndexWriter().addDocument(doc); return 1; } - protected void log (int count) { - if (logStep<0) { - // init once per instance - logStep = getRunData().getConfig().get("doc.add.log.step",DEFAULT_ADD_DOC_LOG_STEP); - } - if (logStep>0 && (count%logStep)==0) { - double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0; - NumberFormat nf = NumberFormat.getInstance(); - nf.setMaximumFractionDigits(2); - System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs"); - } - } - /** * Set the params (docSize only) * @param params docSize, or 0 for no limit. diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java new file mode 100644 index 00000000000..e54b1881b95 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.java @@ -0,0 +1,67 @@ +package org.apache.lucene.benchmark.byTask.tasks; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.feeds.ContentSource; +import org.apache.lucene.benchmark.byTask.feeds.DocData; +import org.apache.lucene.benchmark.byTask.utils.Config; + +/** + * Consumes a {@link org.apache.lucene.benchmark.byTask.feeds.ContentSource}. + * Supports the following parameters: + *
      + *
    • content.source - the content source to use. (mandatory) + *
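To make the mandatory parameter concrete, here is a sketch of driving the task programmatically (illustrative only, not part of this patch; it assumes PerfRunData's existing Config-based constructor and a Reuters collection on disk). In an algorithm file the equivalent would be a line such as {ConsumeContentSource}: 1000.

    import java.util.Properties;
    import org.apache.lucene.benchmark.byTask.PerfRunData;
    import org.apache.lucene.benchmark.byTask.tasks.ConsumeContentSourceTask;
    import org.apache.lucene.benchmark.byTask.utils.Config;

    public class ConsumeDemo {
      public static void main(String[] args) throws Exception {
        Properties p = new Properties();
        p.setProperty("content.source",
            "org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource");
        p.setProperty("content.source.forever", "false");
        PerfRunData runData = new PerfRunData(new Config(p));
        ConsumeContentSourceTask task = new ConsumeContentSourceTask(runData);
        for (int i = 0; i < 1000; i++) { // ends early with NoMoreDataException if exhausted
          task.setup();
          task.doLogic();  // pulls one DocData from the configured source
          task.tearDown(); // feeds the "read N documents..." message every log.step records
        }
        task.close();
      }
    }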
    + */ +public class ConsumeContentSourceTask extends PerfTask { + + private ContentSource source; + private DocData dd = new DocData(); + + public ConsumeContentSourceTask(PerfRunData runData) { + super(runData); + Config config = runData.getConfig(); + String sourceClass = config.get("content.source", null); + if (sourceClass == null) { + throw new IllegalArgumentException("content.source must be defined"); + } + try { + source = (ContentSource) Class.forName(sourceClass).newInstance(); + source.setConfig(config); + source.resetInputs(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + protected String getLogMessage(int recsCount) { + return "read " + recsCount + " documents from the content source"; + } + + public void close() throws Exception { + source.close(); + super.close(); + } + + public int doLogic() throws Exception { + dd = source.getNextDocData(dd); + return 1; + } + +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java index f640b5a2034..1e4cb0c2817 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java @@ -22,7 +22,7 @@ import org.apache.lucene.benchmark.byTask.PerfRunData; /** * Delete a document by docid. *
    Other side effects: none. - *
    Relevant properties: doc.delete.log.step , doc.delete.step. + *
    Relevant properties: doc.delete.step, delete.log.step. *
    If no docid param is supplied, deletes doc with id = last-deleted-doc + doc.delete.step. *
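The docid arithmetic is easiest to see with the defaults: the last-deleted counter (the static field shown just below) starts at -1, so with doc.delete.step=8 successive invocations target docids 7, 15, 23, and so on. A compact standalone illustration, not part of this patch:

    public class DeleteStrideDemo {
      public static void main(String[] args) {
        int lastDeleted = -1;     // initial value of DeleteDocTask's static counter
        final int deleteStep = 8; // doc.delete.step default
        for (int i = 0; i < 4; i++) {
          int docid = lastDeleted + deleteStep;
          System.out.println(docid); // prints 7, 15, 23, 31
          lastDeleted = docid;
        }
      }
    }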
    Takes optional param: document id. */ @@ -33,19 +33,16 @@ public class DeleteDocTask extends PerfTask { */ public static final int DEFAULT_DOC_DELETE_STEP = 8; - /** - * Default value for property doc.delete.log.step - indicating how often - * an "deleted N docs" message should be logged. - */ - public static final int DEFAULT_DELETE_DOC_LOG_STEP = 500; - public DeleteDocTask(PerfRunData runData) { super(runData); + // Override log.step, which is read by PerfTask + int deleteLogStep = runData.getConfig().get("delete.log.step", -1); + if (deleteLogStep != -1) { + logStep = deleteLogStep; + } } - private int logStep = -1; private int deleteStep = -1; - private static int numDeleted = 0; private static int lastDeleted = -1; private int docid = -1; @@ -62,10 +59,6 @@ public class DeleteDocTask extends PerfTask { */ public void setup() throws Exception { super.setup(); - // one time static initializations - if (logStep<0) { - logStep = getRunData().getConfig().get("doc.delete.log.step",DEFAULT_DELETE_DOC_LOG_STEP); - } if (deleteStep<0) { deleteStep = getRunData().getConfig().get("doc.delete.step",DEFAULT_DOC_DELETE_STEP); } @@ -73,18 +66,8 @@ public class DeleteDocTask extends PerfTask { docid = (byStep ? lastDeleted + deleteStep : docid); } - /* (non-Javadoc) - * @see PerfTask#tearDown() - */ - public void tearDown() throws Exception { - log(++numDeleted); - super.tearDown(); - } - - private void log (int count) { - if (logStep>0 && (count%logStep)==0) { - System.out.println("--> processed (delete) "+count+" docs, last deleted: "+lastDeleted); - } + protected String getLogMessage(int recsCount) { + return "deleted " + recsCount + " docs, last deleted: " + lastDeleted; } /** diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java index dc6ab720af1..b24a990e415 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java @@ -17,54 +17,80 @@ package org.apache.lucene.benchmark.byTask.tasks; * limitations under the License. */ +import java.text.NumberFormat; + import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.stats.Points; import org.apache.lucene.benchmark.byTask.stats.TaskStats; +import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Format; /** - * A (abstract) task to be tested for performance. - *
    - * Every performance task extends this class, and provides its own doLogic() method, - * which performss the actual task. - *
    - * Tasks performing some work that should be measured for the task, can overide setup() and/or tearDown() and - * placed that work there. - *
    + * An abstract task to be tested for performance.
+ * Every performance task extends this class, and provides its own
+ * {@link #doLogic()} method, which performs the actual task.
+ * Tasks performing some work that should be measured for the task can override
+ * {@link #setup()} and/or {@link #tearDown()} and place that work there.
 * Relevant properties: task.max.depth.log.
 */
 public abstract class PerfTask implements Cloneable {

+  private static final int DEFAULT_LOG_STEP = 1000;
+
   private PerfRunData runData;

   // properties that all tasks have
   private String name;
   private int depth = 0;
+  protected int logStep;
+  private int logStepCount = 0;
   private int maxDepthLogStart = 0;
   private boolean disableCounting = false;
   protected String params = null;

   protected static final String NEW_LINE = System.getProperty("line.separator");

-  /**
-   * Should not be used externally
-   */
+  /** Should not be used externally */
   private PerfTask() {
-    name = Format.simpleName(getClass());
+    name = Format.simpleName(getClass());
     if (name.endsWith("Task")) {
-      name = name.substring(0,name.length()-4);
+      name = name.substring(0, name.length() - 4);
     }
   }

+  /**
+   * @deprecated will be removed in 3.0. checks if there are any obsolete
+   *             settings, like doc.add.log.step and doc.delete.log.step and
+   *             alerts the user.
+   */
+  private void checkObsoleteSettings(Config config) {
+    if (config.get("doc.add.log.step", null) != null) {
+      throw new RuntimeException("doc.add.log.step is not supported anymore. " +
+          "Use log.step and refer to CHANGES to read on the recent API changes " +
+          "done to Benchmark's DocMaker and Task-based logging.");
+    }
+
+    if (config.get("doc.delete.log.step", null) != null) {
+      throw new RuntimeException("doc.delete.log.step is not supported anymore. " +
+          "Use delete.log.step and refer to CHANGES to read on the recent API changes " +
+          "done to Benchmark's DocMaker and Task-based logging.");
+    }
+  }
+
   public PerfTask(PerfRunData runData) {
     this();
     this.runData = runData;
-    this.maxDepthLogStart = runData.getConfig().get("task.max.depth.log",0);
+    Config config = runData.getConfig();
+    this.maxDepthLogStart = config.get("task.max.depth.log",0);
+    logStep = config.get("log.step", DEFAULT_LOG_STEP);
+    // To avoid the check 'if (logStep > 0)' in tearDown(). This effectively
+    // turns logging off.
+    if (logStep <= 0) {
+      logStep = Integer.MAX_VALUE;
+    }
+    checkObsoleteSettings(config);
   }

-  /* (non-Javadoc)
-   * @see java.lang.Object#clone()
-   */
   protected Object clone() throws CloneNotSupportedException {
     // tasks having non primitive data structures should override this.
     // otherwise parallel running of a task sequence might not run correctly.
@@ -173,6 +199,10 @@ public abstract class PerfTask implements Cloneable {
     return maxDepthLogStart;
   }

+  protected String getLogMessage(int recsCount) {
+    return "processed " + recsCount + " records";
+  }
+
   /**
    * Tasks that should never log at start can override this.
    * @return true if this task should never log when it starts.
@@ -207,7 +237,14 @@
    * Notice that higher level (sequence) tasks containing this task would then
    * measure larger time than the sum of their contained tasks.
*/ - public void tearDown () throws Exception { + public void tearDown() throws Exception { + if (++logStepCount % logStep == 0) { + double time = (System.currentTimeMillis() - runData.getStartTimeMillis()) / 1000.0; + NumberFormat nf = NumberFormat.getInstance(); + nf.setMaximumFractionDigits(2); + System.out.println(nf.format(time) + " sec --> " + + Thread.currentThread().getName() + " " + getLogMessage(logStepCount)); + } } /** diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java index a2f802a091a..850d2406cf9 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java @@ -17,58 +17,44 @@ package org.apache.lucene.benchmark.byTask.tasks; * limitations under the License. */ -import org.apache.lucene.benchmark.byTask.PerfRunData; -import org.apache.lucene.benchmark.byTask.feeds.DocMaker; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import java.text.NumberFormat; import java.io.Reader; import java.util.List; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.feeds.DocMaker; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; /** * Simple task to test performance of tokenizers. It just * creates a token stream for each field of the document and * read all tokens out of that stream. - *
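Under this scheme a task no longer implements its own logging; it overrides getLogMessage() and lets the tearDown() above do the counting. A hypothetical subclass (MyCountingTask is not part of this patch; only the signatures shown in this file are relied on):

    import org.apache.lucene.benchmark.byTask.PerfRunData;
    import org.apache.lucene.benchmark.byTask.tasks.PerfTask;

    public class MyCountingTask extends PerfTask {
      public MyCountingTask(PerfRunData runData) {
        super(runData); // reads log.step (default 1000) and rejects the obsolete keys
      }
      public int doLogic() throws Exception {
        // the measured work goes here; the return value feeds the run's record count
        return 1;
      }
      protected String getLogMessage(int recsCount) {
        return "crunched " + recsCount + " records"; // printed every log.step tearDown() calls
      }
    }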
    Relevant properties: doc.tokenize.log.step. */ public class ReadTokensTask extends PerfTask { - /** - * Default value for property doc.tokenize.log.step - indicating how often - * an "added N docs / M tokens" message should be logged. - */ - public static final int DEFAULT_DOC_LOG_STEP = 500; - public ReadTokensTask(PerfRunData runData) { super(runData); } - private int logStep = -1; - int count = 0; - int totalTokenCount = 0; + private int totalTokenCount = 0; // volatile data passed between setup(), doLogic(), tearDown(). private Document doc = null; - /* - * (non-Javadoc) - * @see PerfTask#setup() - */ public void setup() throws Exception { super.setup(); DocMaker docMaker = getRunData().getDocMaker(); doc = docMaker.makeDocument(); } - /* (non-Javadoc) - * @see PerfTask#tearDown() - */ + protected String getLogMessage(int recsCount) { + return "read " + recsCount + " docs; " + totalTokenCount + " tokens"; + } + public void tearDown() throws Exception { - log(++count); doc = null; super.tearDown(); } @@ -117,19 +103,6 @@ public class ReadTokensTask extends PerfTask { return tokenCount; } - private void log(int count) { - if (logStep<0) { - // init once per instance - logStep = getRunData().getConfig().get("doc.tokenize.log.step", DEFAULT_DOC_LOG_STEP); - } - if (logStep>0 && (count%logStep)==0) { - double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0; - NumberFormat nf = NumberFormat.getInstance(); - nf.setMaximumFractionDigits(2); - System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs" + "; " + totalTokenCount + " tokens"); - } - } - /* Simple StringReader that can be reset to a new string; * we use this when tokenizing the string value from a * Field. */ diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java index 0d0104045a9..3bea29baa2d 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java @@ -62,6 +62,7 @@ public class TaskSequence extends PerfTask { for(int i=0;iOther side effects: none. - *
    Relevant properties: doc.add.log.step. *
    Takes optional param: document size. */ public class UpdateDocTask extends PerfTask { @@ -38,17 +34,11 @@ public class UpdateDocTask extends PerfTask { super(runData); } - private int logStep = -1; private int docSize = 0; - int count = 0; // volatile data passed between setup(), doLogic(), tearDown(). private Document doc = null; - /* - * (non-Javadoc) - * @see PerfTask#setup() - */ public void setup() throws Exception { super.setup(); DocMaker docMaker = getRunData().getDocMaker(); @@ -59,38 +49,24 @@ public class UpdateDocTask extends PerfTask { } } - /* (non-Javadoc) - * @see PerfTask#tearDown() - */ public void tearDown() throws Exception { - log(++count); doc = null; super.tearDown(); } public int doLogic() throws Exception { - final String docID = doc.get(BasicDocMaker.ID_FIELD); + final String docID = doc.get(DocMaker.ID_FIELD); if (docID == null) { throw new IllegalStateException("document must define the docid field"); } - getRunData().getIndexWriter().updateDocument(new Term(BasicDocMaker.ID_FIELD, docID), - doc); + getRunData().getIndexWriter().updateDocument(new Term(DocMaker.ID_FIELD, docID), doc); return 1; } - private void log (int count) { - if (logStep<0) { - // init once per instance - logStep = getRunData().getConfig().get("doc.add.log.step",AddDocTask.DEFAULT_ADD_DOC_LOG_STEP); - } - if (logStep>0 && (count%logStep)==0) { - double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0; - NumberFormat nf = NumberFormat.getInstance(); - nf.setMaximumFractionDigits(2); - System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (update) "+count+" docs"); - } + protected String getLogMessage(int recsCount) { + return "updated " + recsCount + " docs"; } - + /** * Set the params (docSize only) * @param params docSize, or 0 for no limit. diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java index f425e0ef7a1..70d64be470d 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java @@ -25,7 +25,6 @@ import java.io.OutputStreamWriter; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.benchmark.byTask.PerfRunData; -import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; @@ -45,23 +44,13 @@ import org.apache.lucene.document.Field; *
  • bzip.compression - whether the output should be bzip-compressed. This is * recommended when the output file is expected to be large. (optional, default: * false). - *
  • doc.writeline.log.step - controls how many records to process before - * logging the status of the task. NOTE: to disable logging, set this - * value to 0 or negative. (optional, default:1000). * */ public class WriteLineDocTask extends PerfTask { - /** - * Default value for property doc.add.log.step - indicating how often - * an "added N docs" message should be logged. - */ - public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000; public final static char SEP = '\t'; - private int logStep = -1; private int docSize = 0; - int count = 0; private BufferedWriter lineFileOut = null; private DocMaker docMaker; @@ -93,30 +82,23 @@ public class WriteLineDocTask extends PerfTask { } lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16); docMaker = runData.getDocMaker(); - logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP); - // To avoid the check 'if (logStep > 0)' in log(). This effectively turns - // logging off. - if (logStep <= 0) { - logStep = Integer.MAX_VALUE; - } } - public void tearDown() throws Exception { - log(++count); - super.tearDown(); + protected String getLogMessage(int recsCount) { + return "Wrote " + recsCount + " line docs"; } - + public int doLogic() throws Exception { Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument(); - Field f = doc.getField(BasicDocMaker.BODY_FIELD); + Field f = doc.getField(DocMaker.BODY_FIELD); String body = f != null ? f.stringValue().replace('\t', ' ') : null; if (body != null) { - f = doc.getField(BasicDocMaker.TITLE_FIELD); + f = doc.getField(DocMaker.TITLE_FIELD); String title = f != null ? f.stringValue().replace('\t', ' ') : ""; - f = doc.getField(BasicDocMaker.DATE_FIELD); + f = doc.getField(DocMaker.DATE_FIELD); String date = f != null ? f.stringValue().replace('\t', ' ') : ""; lineFileOut.write(title, 0, title.length()); @@ -129,17 +111,6 @@ public class WriteLineDocTask extends PerfTask { return 1; } - private void log(int count) { - // logStep is initialized in the ctor to a positive value. If the config - // file indicates no logging, or contains an invalid value, logStep is init - // to Integer.MAX_VALUE, so that logging will not occur (at least for the - // first Integer.MAX_VALUE records). - if (count % logStep == 0) { - System.out.println("--> " + Thread.currentThread().getName() - + " processed (write line) " + count + " docs"); - } - } - public void close() throws Exception { lineFileOut.close(); super.close(); @@ -156,9 +127,6 @@ public class WriteLineDocTask extends PerfTask { docSize = (int) Float.parseFloat(params); } - /* (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() - */ public boolean supportsParams() { return true; } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java new file mode 100644 index 00000000000..097b89f939f --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.java @@ -0,0 +1,173 @@ +package org.apache.lucene.benchmark.byTask.utils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +/** + * Implements a {@link Reader} over a {@link StringBuffer} instance. Although + * one can use {@link java.io.StringReader} by passing it + * {@link StringBuffer#toString()}, it is better to use this class, as it + * doesn't mark the passed-in {@link StringBuffer} as shared (which will cause + * inner char[] allocations at the next append() attempt).
    + * Notes: + *
      + *
    • This implementation assumes the underlying {@link StringBuffer} is not + * changed during the use of this {@link Reader} implementation. + *
    • This implementation is thread-safe. + *
    • The implementation looks very much like {@link java.io.StringReader} (for + * the right reasons). + *
    • If one wants to reuse that instance, then the following needs to be done: + *
      + * StringBuffer sb = new StringBuffer("some text");
      + * Reader reader = new StringBufferReader(sb);
+ * ... read from reader - don't close it! ...
      + * sb.setLength(0);
      + * sb.append("some new text");
      + * reader.reset();
      + * ... read the new string from the reader ...
      + * 
      + *
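Beyond the reset() pattern above, the reader can also be retargeted at a different buffer with set(StringBuffer), which is how TrecContentSource reuses one instance per thread. A minimal sketch (single-threaded use assumed; note that set() alone does not rewind the read position):

    StringBuffer buf = new StringBuffer("first document");
    StringBufferReader reader = new StringBufferReader(buf);
    // ... read from reader, don't close it ...
    StringBuffer next = new StringBuffer("second document");
    reader.set(next);  // swap in a new buffer (does not rewind by itself)
    reader.reset();    // rewind to the mark (position 0) before reading again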
    + */ +public class StringBufferReader extends Reader { + + // TODO (3.0): change to StringBuffer (including the name of the class) + + // The StringBuffer to read from. + private StringBuffer sb; + + // The length of 'sb'. + private int length; + + // The next position to read from the StringBuffer. + private int next = 0; + + // The mark position. The default value 0 means the start of the text. + private int mark = 0; + + public StringBufferReader(StringBuffer sb) { + set(sb); + } + + /** Check to make sure that the stream has not been closed. */ + private void ensureOpen() throws IOException { + if (sb == null) { + throw new IOException("Stream has already been closed"); + } + } + + public void close() { + synchronized (lock) { + sb = null; + } + } + + /** + * Mark the present position in the stream. Subsequent calls to reset() will + * reposition the stream to this point. + * + * @param readAheadLimit Limit on the number of characters that may be read + * while still preserving the mark. Because the stream's input comes + * from a StringBuffer, there is no actual limit, so this argument + * must not be negative, but is otherwise ignored. + * @exception IllegalArgumentException If readAheadLimit is < 0 + * @exception IOException If an I/O error occurs + */ + public void mark(int readAheadLimit) throws IOException { + if (readAheadLimit < 0){ + throw new IllegalArgumentException("Read-ahead limit cannpt be negative: " + readAheadLimit); + } + synchronized (lock) { + ensureOpen(); + mark = next; + } + } + + public boolean markSupported() { + return true; + } + + public int read() throws IOException { + synchronized (lock) { + ensureOpen(); + return next >= length ? -1 : sb.charAt(next++); + } + } + + public int read(char cbuf[], int off, int len) throws IOException { + synchronized (lock) { + ensureOpen(); + + // Validate parameters + if (off < 0 || off > cbuf.length || len < 0 || off + len > cbuf.length) { + throw new IndexOutOfBoundsException("off=" + off + " len=" + len + " cbuf.length=" + cbuf.length); + } + + if (len == 0) { + return 0; + } + + if (next >= length) { + return -1; + } + + int n = Math.min(length - next, len); + sb.getChars(next, next + n, cbuf, off); + next += n; + return n; + } + } + + public boolean ready() throws IOException { + synchronized (lock) { + ensureOpen(); + return true; + } + } + + public void reset() throws IOException { + synchronized (lock) { + ensureOpen(); + next = mark; + length = sb.length(); + } + } + + public void set(StringBuffer sb) { + synchronized (lock) { + this.sb = sb; + length = sb.length(); + } + } + public long skip(long ns) throws IOException { + synchronized (lock) { + ensureOpen(); + if (next >= length) { + return 0; + } + + // Bound skip by beginning and end of the source + long n = Math.min(length - next, ns); + n = Math.max(-next, n); + next += n; + return n; + } + } + +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java index 57e22de8389..75f68a65ef0 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java @@ -17,18 +17,17 @@ package org.apache.lucene.benchmark.utils; * limitations under the License. 
*/ -import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Properties; + import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker; import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.Properties; - /** * Extract the downloaded Wikipedia dump into separate files for indexing. */ @@ -51,7 +50,6 @@ public class ExtractWikipedia { } } - public File directory(int count, File directory) { if (directory == null) { directory = outputDir; @@ -99,7 +97,8 @@ public class ExtractWikipedia { long start = System.currentTimeMillis(); try { while ((doc = docMaker.makeDocument()) != null) { - create(doc.get(BasicDocMaker.ID_FIELD), doc.get(BasicDocMaker.TITLE_FIELD), doc.get(BasicDocMaker.DATE_FIELD), doc.get(BasicDocMaker.BODY_FIELD)); + create(doc.get(DocMaker.ID_FIELD), doc.get(DocMaker.TITLE_FIELD), doc + .get(DocMaker.DATE_FIELD), doc.get(DocMaker.BODY_FIELD)); } } catch (NoMoreDataException e) { //continue @@ -130,7 +129,7 @@ public class ExtractWikipedia { Properties properties = new Properties(); properties.setProperty("docs.file", wikipedia.getAbsolutePath()); - properties.setProperty("doc.maker.forever", "false"); + properties.setProperty("content.source.forever", "false"); properties.setProperty("keep.image.only.docs", String.valueOf(keepImageOnlyDocs)); docMaker.setConfig(new Config(properties)); docMaker.resetInputs(); diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java index 0bf4ca220f0..d452045d98e 100755 --- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java +++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java @@ -17,6 +17,7 @@ package org.apache.lucene.benchmark.byTask; +import java.io.IOException; import java.io.StringReader; import java.io.File; import java.io.FileReader; @@ -26,7 +27,7 @@ import java.util.Iterator; import org.apache.lucene.benchmark.byTask.feeds.DocData; import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; -import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker; +import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource; import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker; import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask; import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask; @@ -114,7 +115,7 @@ public class TestPerfTasksLogic extends TestCase { }; CountingSearchTestTask.numSearches = 0; - Benchmark benchmark = execBenchmark(algLines); + execBenchmark(algLines); assertTrue(CountingSearchTestTask.numSearches > 0); long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis; assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500); @@ -124,7 +125,7 @@ public class TestPerfTasksLogic extends TestCase { // 1. 
     String algLines[] = {
         "doc.stored=true",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
+        "content.source="+Reuters20ContentSource.class.getName(),
         "query.maker=" + ReutersQueryMaker.class.getName(),
         "ResetSystemErase",
         "CreateIndex",
@@ -162,7 +163,7 @@ public class TestPerfTasksLogic extends TestCase {
     String algLines[] = {
         "doc.stored=true",//doc storage is required in order to have text to highlight
         "doc.term.vector.offsets=true",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
+        "content.source="+Reuters20ContentSource.class.getName(),
         "query.maker=" + ReutersQueryMaker.class.getName(),
         "ResetSystemErase",
         "CreateIndex",
@@ -199,7 +200,7 @@ public class TestPerfTasksLogic extends TestCase {
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "doc.stored=false",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
+        "content.source="+Reuters20ContentSource.class.getName(),
         "query.maker=" + ReutersQueryMaker.class.getName(),
         "ResetSystemErase",
         "CreateIndex",
@@ -227,14 +228,14 @@ public class TestPerfTasksLogic extends TestCase {
   /**
    * Test Exhausting Doc Maker logic
    */
-  public void testExhaustDocMaker() throws Exception {
+  public void testExhaustContentSource() throws Exception {
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker",
-        "doc.add.log.step=1",
+        "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
+        "content.source.log.step=1",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "doc.tokenized=false",
@@ -274,10 +275,10 @@ public class TestPerfTasksLogic extends TestCase {
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=FSDirectory",
         "doc.stored=false",
         "doc.tokenized=false",
@@ -292,7 +293,7 @@ public class TestPerfTasksLogic extends TestCase {
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); } @@ -309,8 +310,8 @@ public class TestPerfTasksLogic extends TestCase { // Creates a line file with first 500 docs from reuters String algLines1[] = { "# ----- properties ", - "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker", - "doc.maker.forever=false", + "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource", + "content.source.forever=false", "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'), "# ----- alg ", "{WriteLineDoc()}:" + NUM_TRY_DOCS, @@ -335,7 +336,7 @@ public class TestPerfTasksLogic extends TestCase { "analyzer=org.apache.lucene.analysis.SimpleAnalyzer", "doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker", "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'), - "doc.maker.forever=false", + "content.source.forever=false", "doc.reuse.fields=false", "autocommit=false", "ram.flush.mb=4", @@ -373,7 +374,7 @@ public class TestPerfTasksLogic extends TestCase { String algLines1[] = { "# ----- properties ", "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer", - "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker", + "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource", "# ----- alg ", "{ReadTokens}: " + NUM_DOCS, "ResetSystemErase", @@ -421,10 +422,10 @@ public class TestPerfTasksLogic extends TestCase { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", - "doc.maker="+Reuters20DocMaker.class.getName(), - "doc.add.log.step=3", + "content.source="+Reuters20ContentSource.class.getName(), + "content.source.log.step=3", "doc.term.vector=false", - "doc.maker.forever=false", + "content.source.forever=false", "directory=RAMDirectory", "doc.stored=false", "doc.tokenized=false", @@ -442,7 +443,7 @@ public class TestPerfTasksLogic extends TestCase { // 3. test number of docs in the index IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); - int ndocsExpected = 2 * 20; // Reuters20DocMaker exhausts after 20 docs. + int ndocsExpected = 2 * 20; // Reuters20ContentSource exhausts after 20 docs. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); } @@ -477,16 +478,19 @@ public class TestPerfTasksLogic extends TestCase { } /** use reuters and the exhaust mechanism, but to be faster, add 20 docs only... */ - public static class Reuters20DocMaker extends ReutersDocMaker { - private int nDocs=0; - protected synchronized DocData getNextDocData() throws Exception { - if (nDocs>=20 && !forever) { + public static class Reuters20ContentSource extends ReutersContentSource { + private int nDocs = 0; + + public synchronized DocData getNextDocData(DocData docData) + throws NoMoreDataException, IOException { + if (nDocs >= 20 && !forever) { throw new NoMoreDataException(); } nDocs++; - return super.getNextDocData(); + return super.getNextDocData(docData); } - public synchronized void resetInputs() { + + public synchronized void resetInputs() throws IOException { super.resetInputs(); nDocs = 0; } @@ -499,10 +503,10 @@ public class TestPerfTasksLogic extends TestCase { // 1. 
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "doc.tokenized=false",
@@ -521,7 +525,7 @@ public class TestPerfTasksLogic extends TestCase {
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
   }
@@ -533,12 +537,12 @@ public class TestPerfTasksLogic extends TestCase {
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
         "ram.flush.mb=-1",
         "max.buffered=2",
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "doc.tokenized=false",
@@ -557,7 +561,7 @@ public class TestPerfTasksLogic extends TestCase {
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
   }
@@ -577,10 +581,10 @@ public class TestPerfTasksLogic extends TestCase {
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "merge.scheduler=" + MyMergeScheduler.class.getName(),
         "doc.stored=false",
@@ -601,7 +605,7 @@ public class TestPerfTasksLogic extends TestCase {
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
   }
@@ -620,12 +624,12 @@ public class TestPerfTasksLogic extends TestCase {
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "ram.flush.mb=-1",
         "max.buffered=2",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "merge.policy=" + MyMergePolicy.class.getName(),
         "doc.stored=false",
@@ -646,7 +650,7 @@ public class TestPerfTasksLogic extends TestCase {
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
   }
@@ -658,13 +662,13 @@ public class TestPerfTasksLogic extends TestCase {
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "ram.flush.mb=-1",
         "max.buffered=2",
         "compound=cmpnd:true:false",
         "doc.term.vector=vector:false:true",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "merge.factor=3",
@@ -702,12 +706,12 @@ public class TestPerfTasksLogic extends TestCase {
     // 1. alg definition (required in every "logic" test)
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=3",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=3",
         "ram.flush.mb=-1",
         "max.buffered=3",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "merge.policy=org.apache.lucene.index.LogDocMergePolicy",
         "doc.stored=false",
@@ -728,7 +732,7 @@ public class TestPerfTasksLogic extends TestCase {
 
     // 3. test number of docs in the index
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
-    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
+    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
     ir.close();
@@ -780,10 +784,10 @@ public class TestPerfTasksLogic extends TestCase {
     String dis = disable ? "-" : "";
     return new String[] {
         "# ----- properties ",
-        "doc.maker="+Reuters20DocMaker.class.getName(),
-        "doc.add.log.step=30",
+        "content.source="+Reuters20ContentSource.class.getName(),
+        "content.source.log.step=30",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=RAMDirectory",
         "doc.stored=false",
         "doc.tokenized=false",
diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
index 0daa32820a8..b3d66feb6e2 100644
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocMakerTest.java
@@ -111,35 +111,11 @@ public class LineDocMakerTest extends BenchmarkTestCase {
     doIndexAndSearchTest(file, false, null);
   }
 
-  public void testBZip2WithBzipCompressionDisabled() throws Exception {
-    File file = new File(getWorkDir(), "one-line.bz2");
-    createBZ2LineFile(file);
-
-    try {
-      doIndexAndSearchTest(file, true, "false");
-      fail("Some exception should have been thrown !");
-    } catch (Exception e) {
-      // expected.
-    }
-  }
-
   public void testRegularFile() throws Exception {
     File file = new File(getWorkDir(), "one-line");
     createRegularLineFile(file);
     doIndexAndSearchTest(file, false, null);
   }
-
-  public void testRegularFileWithBZipCompressionEnabled() throws Exception {
-    File file = new File(getWorkDir(), "one-line");
-    createRegularLineFile(file);
-
-    try {
-      doIndexAndSearchTest(file, true, "true");
-      fail("Some exception should have been thrown !");
-    } catch (Exception e) {
-      // expected.
-    }
-  }
 
   public void testInvalidFormat() throws Exception {
     String[] testCases = new String[] {
diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
similarity index 84%
rename from contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java
rename to contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
index e34d26504b2..d76d514a7e7 100644
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecDocMakerTest.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
@@ -18,24 +18,29 @@ package org.apache.lucene.benchmark.byTask.feeds;
  */
 
 import java.io.BufferedReader;
+import java.io.IOException;
 import java.io.StringReader;
+import java.text.ParseException;
 import java.util.Date;
 
 import junit.framework.TestCase;
 
-public class TrecDocMakerTest extends TestCase {
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.DateTools;
+
+public class TrecContentSourceTest extends TestCase {
 
   /** A TrecDocMaker which works on a String and not files. */
-  private static class StringableTrecDocMaker extends TrecDocMaker {
+  private static class StringableTrecSource extends TrecContentSource {
 
     private String docs = null;
 
-    public StringableTrecDocMaker(String docs, boolean forever) {
+    public StringableTrecSource(String docs, boolean forever) {
       this.docs = docs;
       this.forever = forever;
     }
 
-    protected void openNextFile() throws NoMoreDataException, Exception {
+    protected void openNextFile() throws NoMoreDataException, IOException {
       if (reader != null) {
         if (!forever) {
           throw new NoMoreDataException();
@@ -46,20 +51,26 @@ public class TrecDocMakerTest extends TestCase {
 
       reader = new BufferedReader(new StringReader(docs));
     }
+
+    public void setConfig(Config config) {
+      htmlParser = new DemoHTMLParser();
+    }
   }
 
-  private void assertDocData(DocData dd, String expName, String expTitle, String expBody, Date expDate) {
+  private void assertDocData(DocData dd, String expName, String expTitle,
+      String expBody, Date expDate)
+      throws ParseException {
     assertNotNull(dd);
     assertEquals(expName, dd.getName());
     assertEquals(expTitle, dd.getTitle());
     assertTrue(dd.getBody().indexOf(expBody) != -1);
-    assertEquals(expDate, dd.getDate());
+    Date date = dd.getDate() != null ? DateTools.stringToDate(dd.getDate()) : null;
+    assertEquals(expDate, date);
   }
 
-  private void assertNoMoreDataException(StringableTrecDocMaker stdm) throws Exception {
+  private void assertNoMoreDataException(StringableTrecSource stdm) throws Exception {
     boolean thrown = false;
     try {
-      stdm.getNextDocData();
+      stdm.getNextDocData(null);
     } catch (NoMoreDataException e) {
       thrown = true;
     }
@@ -93,14 +104,14 @@ public class TrecDocMakerTest extends TestCase {
     "\r\n" +
     "\r\n" +
     "";
-    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
-    stdm.setHTMLParser(new DemoHTMLParser());
-
-    DocData dd = stdm.getNextDocData();
-    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
 
-    assertNoMoreDataException(stdm);
+    assertNoMoreDataException(source);
   }
 
   public void testTwoDocuments() throws Exception {
@@ -156,18 +167,18 @@ public class TrecDocMakerTest extends TestCase {
     "\r\n" +
     "\r\n" +
     "";
-    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
-    stdm.setHTMLParser(new DemoHTMLParser());
-
-    DocData dd = stdm.getNextDocData();
-    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
+
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
 
-    dd = stdm.getNextDocData();
-    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
+    dd = source.getNextDocData(dd);
+    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
 
-    assertNoMoreDataException(stdm);
+    assertNoMoreDataException(source);
   }
 
   // If a Date: attribute is missing, make sure the document is not skipped, but
@@ -224,17 +235,17 @@ public class TrecDocMakerTest extends TestCase {
     "\r\n" +
     "\r\n" +
     "";
-    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
-    stdm.setHTMLParser(new DemoHTMLParser());
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
 
-    DocData dd = stdm.getNextDocData();
+    DocData dd = source.getNextDocData(new DocData());
     assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
 
-    dd = stdm.getNextDocData();
-    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
+    dd = source.getNextDocData(dd);
+    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));
 
-    assertNoMoreDataException(stdm);
+    assertNoMoreDataException(source);
  }
 
   // When a 'bad date' is input (unparsable date), make sure the DocData date is
@@ -266,13 +277,13 @@ public class TrecDocMakerTest extends TestCase {
     "\r\n" +
     "\r\n" +
     "";
-    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
-    stdm.setHTMLParser(new DemoHTMLParser());
+    StringableTrecSource source = new StringableTrecSource(docs, false);
+    source.setConfig(null);
 
-    DocData dd = stdm.getNextDocData();
+    DocData dd = source.getNextDocData(new DocData());
     assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);
 
-    assertNoMoreDataException(stdm);
+    assertNoMoreDataException(source);
   }
 
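  // For orientation: the tests above all follow the consumption loop the new
  // feeds API expects. A minimal sketch of draining a ContentSource until a
  // non-forever source is exhausted (hypothetical helper, not part of this
  // patch; it relies only on methods that appear in this diff):
  private static void drainContentSource(ContentSource source)
      throws IOException {
    source.resetInputs();
    DocData dd = new DocData();
    try {
      while (true) {
        dd = source.getNextDocData(dd); // the same DocData instance is recycled
        // a consumer would read dd.getName(), dd.getTitle(), dd.getBody() and
        // dd.getDate() here
      }
    } catch (NoMoreDataException e) {
      // expected once the source runs out of documents
    }
  }
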
   public void testForever() throws Exception {
@@ -302,16 +313,16 @@ public class TrecDocMakerTest extends TestCase {
     "\r\n" +
     "\r\n" +
     "";
-    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, true);
-    stdm.setHTMLParser(new DemoHTMLParser());
+    StringableTrecSource source = new StringableTrecSource(docs, true);
+    source.setConfig(null);
 
-    DocData dd = stdm.getNextDocData();
-    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
+    DocData dd = source.getNextDocData(new DocData());
+    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
 
     // same document, but the second iteration changes the name.
-    dd = stdm.getNextDocData();
-    assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", stdm
+    dd = source.getNextDocData(dd);
+    assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
 
     // Don't test that NoMoreDataException is thrown, since the forever flag is
diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
index 52dbcf97914..dfc7967a1e1 100644
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
@@ -27,8 +27,8 @@ import java.util.Properties;
 
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.lucene.benchmark.BenchmarkTestCase;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.DocData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.document.Document;
@@ -40,7 +40,7 @@ import org.apache.lucene.document.Field.Store;
 public class WriteLineDocTaskTest extends BenchmarkTestCase {
 
   // class has to be public so that Class.forName.newInstance() will work
-  public static final class WriteLineDocMaker extends BasicDocMaker {
+  public static final class WriteLineDocMaker extends DocMaker {
 
     protected DocData getNextDocData() throws NoMoreDataException, Exception {
       throw new UnsupportedOperationException("not implemented");
diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java
index 3cb6503666c..df39b0b52de 100644
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java
@@ -23,7 +23,7 @@ import java.io.FileReader;
 import java.io.PrintWriter;
 
 import org.apache.lucene.benchmark.byTask.TestPerfTasksLogic;
-import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource;
 import org.apache.lucene.benchmark.quality.Judge;
 import org.apache.lucene.benchmark.quality.QualityQuery;
 import org.apache.lucene.benchmark.quality.QualityQueryParser;
@@ -155,10 +155,10 @@ public class TestQualityRun extends TestCase {
     // 1. alg definition
     String algLines[] = {
         "# ----- properties ",
-        "doc.maker="+ReutersDocMaker.class.getName(),
-        "doc.add.log.step=2500",
+        "content.source="+ReutersContentSource.class.getName(),
+        "content.source.log.step=2500",
         "doc.term.vector=false",
-        "doc.maker.forever=false",
+        "content.source.forever=false",
         "directory=FSDirectory",
         "doc.stored=true",
         "doc.tokenized=true",