mirror of https://github.com/apache/lucene.git
LUCENE-1595: Separate DocMaker into DocMaker and ContentSource.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@786233 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 835c405be0
commit d7d455246f
@@ -3,6 +3,34 @@ Lucene Benchmark Contrib Change Log
 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
 
 $Id:$
+6/17/09
+
+LUCENE-1595: This issue breaks previous external algorithms. DocMaker has been
+replaced with a concrete class which accepts a ContentSource for iterating over
+a content source's documents. Most of the old DocMakers were changed to a
+ContentSource implementation, and DocMaker is now a default document creation
+implementation that provides an easy way to reuse fields. When doc.maker is not
+defined in an algorithm, the new DocMaker is the default. If you have .alg files
+which specify a DocMaker (like ReutersDocMaker), you should change the doc.maker line to:
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+
+i.e.
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+becomes
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
+becomes
+content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+
+Also, PerfTask now logs a message in tearDown() rather than each Task doing its
+own logging. A new setting called log.step is consulted to determine how often
+to log. doc.add.log.step is no longer a valid setting. For easy migration of
+current .alg files, rename doc.add.log.step to log.step and doc.delete.log.step
+to delete.log.step.
+
+Additionally, doc.maker.forever should be changed to content.source.forever.
+(Shai Erera via Mark Miller)
+
 6/12/09
 
 LUCENE-1539: Added DeleteByPercentTask which enables deleting a
 percentage of documents and searching on them. Changed CommitIndex
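Taken together, the renames in this entry compose as follows for a typical .alg file; this is an illustrative before/after sketch, not a fragment from this commit (the property values are arbitrary):

    # before LUCENE-1595
    doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
    doc.add.log.step=500
    doc.maker.forever=false

    # after LUCENE-1595
    content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
    log.step=500
    content.source.forever=false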
@@ -30,13 +30,12 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -38,7 +38,7 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=5000
+log.step=5000
 
 docs.file=temp/enwiki-20070527-pages-articles.xml
 
@@ -34,14 +34,13 @@ directory=FSDirectory
 doc.stored=stored:true:true:false:false
 doc.tokenized=true
 doc.term.vector=vector:true:true:false:false
-doc.add.log.step=500
-doc.delete.log.step=100
+log.step=500
+delete.log.step=100
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -29,13 +29,13 @@
 #
 
 # Where to get documents from:
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 # Where to write the line file output:
 line.file.out=work/reuters.lines.txt
 
 # Stop after processing the document feed once:
-doc.maker.forever=false
+content.source.forever=false
 
 # -------------------------------------------------------------------------------------
 
@@ -25,13 +25,14 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
 #doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -32,14 +32,14 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=10000
-doc.delete.log.step=100
+log.step=10000
+delete.log.step=100
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -36,7 +36,7 @@ docs.file=temp/enwiki-20070527-pages-articles.xml
 line.file.out=work/enwiki.txt
 
 # Stop after processing the document feed once:
-doc.maker.forever=false
+content.source.forever=false
 
 # -------------------------------------------------------------------------------------
 
@@ -28,11 +28,11 @@ doc.tokenized=true
 doc.term.vector=true
 doc.term.vector.offsets=true
 doc.term.vector.positions=true
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
 
@@ -38,7 +38,7 @@ doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
 docs.file=work/reuters.lines.txt
 
 # Process documents only once:
-doc.maker.forever=false
+content.source.forever=false
 
 # -------------------------------------------------------------------------------------
 
@@ -30,13 +30,13 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -30,13 +30,13 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -30,13 +30,13 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -30,13 +30,13 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -29,13 +29,13 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -28,13 +28,13 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -0,0 +1,45 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This alg reads the information from a ContentSource. It is useful for
+# measuring the performance of a particular ContentSource implementation, or
+# for gathering baselines for operations like indexing (if reading from the
+# content source takes 'X' time, we cannot index faster).
+#
+# To use this, first cd to contrib/benchmark and then run:
+#
+# ant run-task -Dtask.alg=conf/readContentSource.alg
+#
+
+# Where to get documents from:
+content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
+docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
+
+# Stop after processing the document feed once:
+content.source.forever=false
+
+# Log messages every:
+log.step=100000
+
+# -------------------------------------------------------------------------------------
+
+# Process all documents from the content source:
+{ ConsumeContentSource } : *
+
+RepSumByPref ConsumeContentSource
@@ -40,13 +40,13 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -28,13 +28,13 @@ directory=FSDirectory
 doc.stored=false
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=500
+log.step=500
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+#content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleSloppyPhraseQueryMaker
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
@@ -29,11 +29,11 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=100000
+log.step=100000
 
 docs.dir=reuters-out
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.SortableSimpleDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 
@@ -29,13 +29,13 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -28,11 +28,11 @@ doc.tokenized=true
 doc.term.vector=false
 doc.term.vector.offsets=false
 doc.term.vector.positions=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
 
@@ -28,11 +28,11 @@ doc.tokenized=true
 doc.term.vector=true
 doc.term.vector.offsets=true
 doc.term.vector.positions=true
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
 
@@ -28,13 +28,13 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=2000
+log.step=2000
 
 docs.dir=reuters-out
 #docs.dir=reuters-111
 
-#doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 
 #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker
 query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
@@ -25,8 +25,8 @@
 # ant run-task -Dtask.alg=conf/tokenize.alg
 #
 
-doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
-doc.maker.forever=false
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+content.source.forever=false
 
 
 #
@@ -37,7 +37,7 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=5000
+log.step=5000
 
 docs.file=temp/enwiki-20070527-pages-articles.xml
 
@@ -33,7 +33,7 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=5000
+log.step=5000
 
 docs.file=temp/enwiki-20070527-pages-articles.xml
 
@@ -33,7 +33,7 @@ directory=FSDirectory
 doc.stored=true
 doc.tokenized=true
 doc.term.vector=false
-doc.add.log.step=5000
+log.step=5000
 
 docs.file=temp/enwiki-20070527-pages-articles.xml
 
@@ -17,9 +17,13 @@ package org.apache.lucene.benchmark.byTask;
  * limitations under the License.
  */
 
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.HTMLParser;
 import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
 import org.apache.lucene.benchmark.byTask.stats.Points;
 import org.apache.lucene.benchmark.byTask.tasks.ReadTask;
@@ -33,11 +37,6 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.RAMDirectory;
 
-import java.io.File;
-import java.util.HashMap;
-import java.util.Iterator;
-
-
 /**
  * Data maintained by a performance test run.
  * <p>
@@ -62,7 +61,6 @@ public class PerfRunData {
   private Directory directory;
   private Analyzer analyzer;
   private DocMaker docMaker;
-  private HTMLParser htmlParser;
 
   // we use separate (identical) instances for each "read" task type, so each can iterate the queries separately.
   private HashMap readTaskQueryMaker;
@@ -82,14 +80,11 @@ public class PerfRunData {
         "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance();
     // doc maker
     docMaker = (DocMaker) Class.forName(config.get("doc.maker",
-        "org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker")).newInstance();
+        "org.apache.lucene.benchmark.byTask.feeds.DocMaker")).newInstance();
     docMaker.setConfig(config);
     // query makers
     readTaskQueryMaker = new HashMap();
     qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker"));
-    // html parser, used for some doc makers
-    htmlParser = (HTMLParser) Class.forName(config.get("html.parser","org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser")).newInstance();
-    docMaker.setHTMLParser(htmlParser);
 
     // index stuff
     reinit(false);
@@ -229,9 +224,7 @@ public class PerfRunData {
     this.analyzer = analyzer;
   }
 
-  /**
-   * @return Returns the docMaker.
-   */
+  /** Returns the docMaker. */
   public DocMaker getDocMaker() {
     return docMaker;
   }
@@ -243,7 +236,7 @@ public class PerfRunData {
     return config;
   }
 
-  public void resetInputs() {
+  public void resetInputs() throws IOException {
     docMaker.resetInputs();
     Iterator it = readTaskQueryMaker.values().iterator();
     while (it.hasNext()) {
@@ -271,11 +264,4 @@ public class PerfRunData {
     return qm;
   }
 
-  /**
-   * @return Returns the htmlParser.
-   */
-  public HTMLParser getHtmlParser() {
-    return htmlParser;
-  }
-
 }
@@ -1,335 +0,0 @@
-package org.apache.lucene.benchmark.byTask.feeds;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.utils.Format;
-import org.apache.lucene.document.DateTools;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-
-import java.io.File;
-import java.io.UnsupportedEncodingException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-
-
-/**
- * Create documents for the test.
- * Maintains counters of chars etc. so that sub-classes just need to
- * provide textual content, and the create-by-size is handled here.
- *
- * <p/>
- * Config Params (default is in caps):
- * doc.stored=true|FALSE<br/>
- * doc.tokenized=TRUE|false<br/>
- * doc.term.vector=true|FALSE<br/>
- * doc.term.vector.positions=true|FALSE<br/>
- * doc.term.vector.offsets=true|FALSE<br/>
- * doc.store.body.bytes=true|FALSE //Store the body contents raw UTF-8 bytes as a field<br/>
- */
-public abstract class BasicDocMaker implements DocMaker {
-
-  private int numDocsCreated = 0;
-  private boolean storeBytes = false;
-  protected boolean forever;
-
-  private static class LeftOver {
-    private DocData docdata;
-    private int cnt;
-  }
-
-  // leftovers are thread local, because it is unsafe to share residues between threads
-  private ThreadLocal leftovr = new ThreadLocal();
-
-  public static final String BODY_FIELD = "body";
-  public static final String TITLE_FIELD = "doctitle";
-  public static final String DATE_FIELD = "docdate";
-  public static final String ID_FIELD = "docid";
-  public static final String BYTES_FIELD = "bytes";
-  public static final String NAME_FIELD = "docname";
-
-  private long numBytes = 0;
-  private long numUniqueBytes = 0;
-
-  protected Config config;
-
-  protected Field.Store storeVal = Field.Store.NO;
-  protected Field.Index indexVal = Field.Index.ANALYZED;
-  protected Field.TermVector termVecVal = Field.TermVector.NO;
-
-  private synchronized int incrNumDocsCreated() {
-    return numDocsCreated++;
-  }
-
-  /**
-   * Return the data of the next document.
-   * All current implementations can create docs forever.
-   * When the input data is exhausted, input files are iterated.
-   * This re-iteration can be avoided by setting doc.maker.forever to false (default is true).
-   * @return data of the next document.
-   * @exception if cannot create the next doc data
-   * @exception NoMoreDataException if data is exhausted (and 'forever' set to false).
-   */
-  protected abstract DocData getNextDocData() throws NoMoreDataException, Exception;
-
-  /*
-   * (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument()
-   */
-  public Document makeDocument () throws Exception {
-    resetLeftovers();
-    DocData docData = getNextDocData();
-    Document doc = createDocument(docData,0,-1);
-    return doc;
-  }
-
-  // create a doc
-  // use only part of the body, modify it to keep the rest (or use all if size==0).
-  // reset the docdata properties so they are not added more than once.
-  private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
-    int docid = incrNumDocsCreated();
-    Document doc = new Document();
-    doc.add(new Field(ID_FIELD, "doc"+docid, storeVal, indexVal, termVecVal));
-    if (docData.getName()!=null) {
-      String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
-      doc.add(new Field(NAME_FIELD, name, storeVal, indexVal, termVecVal));
-    }
-    if (docData.getDate()!=null) {
-      String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
-      doc.add(new Field(DATE_FIELD, dateStr, storeVal, indexVal, termVecVal));
-    }
-    if (docData.getTitle()!=null) {
-      doc.add(new Field(TITLE_FIELD, docData.getTitle(), storeVal, indexVal, termVecVal));
-    }
-    if (docData.getBody()!=null && docData.getBody().length()>0) {
-      String bdy;
-      if (size<=0 || size>=docData.getBody().length()) {
-        bdy = docData.getBody(); // use all
-        docData.setBody(""); // nothing left
-      } else {
-        // attempt not to break words - if whitespace found within next 20 chars...
-        for (int n=size-1; n<size+20 && n<docData.getBody().length(); n++) {
-          if (Character.isWhitespace(docData.getBody().charAt(n))) {
-            size = n;
-            break;
-          }
-        }
-        bdy = docData.getBody().substring(0,size); // use part
-        docData.setBody(docData.getBody().substring(size)); // some left
-      }
-      doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
-      if (storeBytes == true) {
-        doc.add(new Field(BYTES_FIELD, bdy.getBytes("UTF-8"), Field.Store.YES));
-      }
-    }
-
-    if (docData.getProps()!=null) {
-      for (Iterator it = docData.getProps().keySet().iterator(); it.hasNext(); ) {
-        String key = (String) it.next();
-        String val = (String) docData.getProps().get(key);
-        doc.add(new Field(key, val, storeVal, indexVal, termVecVal));
-      }
-      docData.setProps(null);
-    }
-    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
-    return doc;
-  }
-
-  /*
-   * (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument(int)
-   */
-  public Document makeDocument(int size) throws Exception {
-    LeftOver lvr = (LeftOver) leftovr.get();
-    if (lvr==null || lvr.docdata==null || lvr.docdata.getBody()==null || lvr.docdata.getBody().length()==0) {
-      resetLeftovers();
-    }
-    DocData dd = (lvr==null ? getNextDocData() : lvr.docdata);
-    int cnt = (lvr==null ? 0 : lvr.cnt);
-    while (dd.getBody()==null || dd.getBody().length()<size) {
-      DocData dd2 = dd;
-      dd = getNextDocData();
-      cnt = 0;
-      dd.setBody(dd2.getBody() + dd.getBody());
-    }
-    Document doc = createDocument(dd,size,cnt);
-    if (dd.getBody()==null || dd.getBody().length()==0) {
-      resetLeftovers();
-    } else {
-      if (lvr == null) {
-        lvr = new LeftOver();
-        leftovr.set(lvr);
-      }
-      lvr.docdata = dd;
-      lvr.cnt = ++cnt;
-    }
-    return doc;
-  }
-
-  private void resetLeftovers() {
-    leftovr.set(null);
-  }
-
-  /* (non-Javadoc)
-   * @see DocMaker#setConfig(java.util.Properties)
-   */
-  public void setConfig(Config config) {
-    this.config = config;
-    boolean stored = config.get("doc.stored",false);
-    boolean tokenized = config.get("doc.tokenized",true);
-    boolean termVec = config.get("doc.term.vector",false);
-    storeVal = (stored ? Field.Store.YES : Field.Store.NO);
-    indexVal = (tokenized ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED);
-    boolean termVecPositions = config.get("doc.term.vector.positions",false);
-    boolean termVecOffsets = config.get("doc.term.vector.offsets",false);
-    if (termVecPositions && termVecOffsets)
-      termVecVal = Field.TermVector.WITH_POSITIONS_OFFSETS;
-    else if (termVecPositions)
-      termVecVal = Field.TermVector.WITH_POSITIONS;
-    else if (termVecOffsets)
-      termVecVal = Field.TermVector.WITH_OFFSETS;
-    else if (termVec)
-      termVecVal = Field.TermVector.YES;
-    else
-      termVecVal = Field.TermVector.NO;
-    storeBytes = config.get("doc.store.body.bytes", false);
-    forever = config.get("doc.maker.forever",true);
-  }
-
-  /*
-   * (non-Javadoc)
-   * @see DocMaker#resetIinputs()
-   */
-  public synchronized void resetInputs() {
-    printDocStatistics();
-    setConfig(config); //re-initiate since properties by round may have changed.
-    numBytes = 0;
-    numDocsCreated = 0;
-    resetLeftovers();
-  }
-
-  /*
-   * (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#numUniqueBytes()
-   */
-  public long numUniqueBytes() {
-    return numUniqueBytes;
-  }
-
-  /*
-   * (non-Javadoc)
-   * @see DocMaker#getCount()
-   */
-  public synchronized int getCount() {
-    return numDocsCreated;
-  }
-
-  /*
-   * (non-Javadoc)
-   * @see DocMaker#getByteCount()
-   */
-  public synchronized long getByteCount() {
-    return numBytes;
-  }
-
-  protected void addUniqueBytes (long n) {
-    numUniqueBytes += n;
-  }
-
-  protected void resetUniqueBytes () {
-    numUniqueBytes = 0;
-  }
-
-  protected synchronized void addBytes (long n) {
-    numBytes += n;
-  }
-
-  /*
-   * (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#printDocStatistics()
-   */
-  private int lastPrintedNumUniqueTexts = 0;
-  private long lastPrintedNumUniqueBytes = 0;
-  private int printNum = 0;
-  private HTMLParser htmlParser;
-
-  public void printDocStatistics() {
-    boolean print = false;
-    String col = " ";
-    StringBuffer sb = new StringBuffer();
-    String newline = System.getProperty("line.separator");
-    sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline);
-    int nut = numUniqueTexts();
-    if (nut > lastPrintedNumUniqueTexts) {
-      print = true;
-      sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline);
-      lastPrintedNumUniqueTexts = nut;
-    }
-    long nub = numUniqueBytes();
-    if (nub > lastPrintedNumUniqueBytes) {
-      print = true;
-      sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
-      lastPrintedNumUniqueBytes = nub;
-    }
-    if (getCount()>0) {
-      print = true;
-      sb.append("num docs added since last inputs reset: ").append(Format.format(0,getCount(),col)).append(newline);
-      sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getByteCount(),col)).append(newline);
-    }
-    if (print) {
-      System.out.println(sb.append(newline).toString());
-      printNum++;
-    }
-  }
-
-  protected void collectFiles(File f, ArrayList inputFiles) {
-    //System.out.println("Collect: "+f.getAbsolutePath());
-    if (!f.canRead()) {
-      return;
-    }
-    if (f.isDirectory()) {
-      String files[] = f.list();
-      Arrays.sort(files);
-      for (int i = 0; i < files.length; i++) {
-        collectFiles(new File(f,files[i]),inputFiles);
-      }
-      return;
-    }
-    inputFiles.add(f);
-    addUniqueBytes(f.length());
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#setHTMLParser(org.apache.lucene.benchmark.byTask.feeds.HTMLParser)
-   */
-  public void setHTMLParser(HTMLParser htmlParser) {
-    this.htmlParser = htmlParser;
-  }
-
-  /*
-   * (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#getHtmlParser()
-   */
-  public HTMLParser getHtmlParser() {
-    return htmlParser;
-  }
-
-}
@@ -0,0 +1,201 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
+/**
+ * Represents content from a specified source, such as TREC, Reuters etc. A
+ * {@link ContentSource} is responsible for creating {@link DocData} objects for
+ * its documents to be consumed by {@link ToDeleteDocMaker}. It also keeps track of
+ * various statistics, such as how many documents were generated, size in bytes
+ * etc.
+ * <p>
+ * Supports the following configuration parameters:
+ * <ul>
+ * <li><b>content.source.forever</b> - specifies whether to generate documents
+ * forever (<b>default=true</b>).
+ * <li><b>content.source.verbose</b> - specifies whether messages should be
+ * output by the content source (<b>default=false</b>).
+ * <li><b>content.source.log.step</b> - specifies for how many documents a
+ * message should be logged. If set to 0 it means no logging should occur.
+ * <b>NOTE:</b> if verbose is set to false, logging should not occur even if
+ * logStep is not 0 (<b>default=0</b>).
+ * </ul>
+ */
+public abstract class ContentSource {
+
+  private static final int BZIP = 0;
+  private static final int OTHER = 1;
+  private static final Map extensionToType = new HashMap();
+  static {
+    extensionToType.put(".bz2", Integer.valueOf(BZIP));
+    extensionToType.put(".bzip", Integer.valueOf(BZIP));
+  }
+
+  protected static final int BUFFER_SIZE = 1 << 16; // 64K
+
+  private long bytesCount;
+  private long totalBytesCount;
+  private int docsCount;
+  private int totalDocsCount;
+  private Config config;
+
+  protected boolean forever;
+  protected int logStep;
+  protected boolean verbose;
+
+  private CompressorStreamFactory csFactory = new CompressorStreamFactory();
+
+  protected final synchronized void addBytes(long numBytes) {
+    bytesCount += numBytes;
+    totalBytesCount += numBytes;
+  }
+
+  protected final synchronized void addDoc() {
+    ++docsCount;
+    ++totalDocsCount;
+  }
+
+  /**
+   * A convenience method for collecting all the files of a content source from
+   * a given directory. The collected {@link File} instances are stored in the
+   * given <code>files</code>.
+   */
+  protected final void collectFiles(File dir, ArrayList files) {
+    if (!dir.canRead()) {
+      return;
+    }
+
+    File[] dirFiles = dir.listFiles();
+    Arrays.sort(dirFiles);
+    for (int i = 0; i < dirFiles.length; i++) {
+      File file = dirFiles[i];
+      if (file.isDirectory()) {
+        collectFiles(file, files);
+      } else if (file.canRead()) {
+        files.add(file);
+      }
+    }
+  }
+
+  /**
+   * Returns an {@link InputStream} over the requested file. This method
+   * attempts to identify the appropriate {@link InputStream} instance to return
+   * based on the file name (e.g., if it ends with .bz2 or .bzip, return a
+   * 'bzip' {@link InputStream}).
+   */
+  protected InputStream getInputStream(File file) throws IOException {
+    // First, create a FileInputStream, as this will be required by all types.
+    // Wrap with BufferedInputStream for better performance
+    InputStream is = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE);
+
+    String fileName = file.getName();
+    int idx = fileName.lastIndexOf('.');
+    int type = OTHER;
+    if (idx != -1) {
+      Integer typeInt = (Integer) extensionToType.get(fileName.substring(idx));
+      if (typeInt != null) {
+        type = typeInt.intValue();
+      }
+    }
+    switch (type) {
+      case BZIP:
+        try {
+          // According to BZip2CompressorInputStream's code, it reads the first
+          // two file header chars ('B' and 'Z'). It is important to wrap the
+          // underlying input stream with a buffered one since
+          // Bzip2CompressorInputStream uses the read() method exclusively.
+          is = csFactory.createCompressorInputStream("bzip2", is);
+        } catch (CompressorException e) {
+          IOException ioe = new IOException(e.getMessage());
+          ioe.initCause(e);
+          throw ioe;
+        }
+        break;
+      default: // Do nothing, stay with FileInputStream
+    }
+
+    return is;
+  }
+
+  /**
+   * Returns true if it's time to log a message (depending on verbose and
+   * the number of documents generated).
+   */
+  protected final boolean shouldLog() {
+    return verbose && logStep > 0 && docsCount % logStep == 0;
+  }
+
+  /** Called when reading from this content source is no longer required. */
+  public abstract void close() throws IOException;
+
+  /** Returns the number of bytes generated since last reset. */
+  public final long getBytesCount() { return bytesCount; }
+
+  /** Returns the number of generated documents since last reset. */
+  public final int getDocsCount() { return docsCount; }
+
+  public final Config getConfig() { return config; }
+
+  /** Returns the next {@link DocData} from the content source. */
+  public abstract DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException;
+
+  /** Returns the total number of bytes that were generated by this source. */
+  public final long getTotalBytesCount() { return totalBytesCount; }
+
+  /** Returns the total number of generated documents. */
+  public final int getTotalDocsCount() { return totalDocsCount; }
+
+  /**
+   * Resets the input for this content source, so that the test would behave as
+   * if it was just started, input-wise.
+   * <p>
+   * <b>NOTE:</b> the default implementation resets the number of bytes and
+   * documents generated since the last reset, so it's important to call
+   * super.resetInputs in case you override this method.
+   */
+  public void resetInputs() throws IOException {
+    bytesCount = 0;
+    docsCount = 0;
+  }
+
+  /**
+   * Sets the {@link Config} for this content source. If you override this
+   * method, you must call super.setConfig.
+   */
+  public void setConfig(Config config) {
+    this.config = config;
+    forever = config.get("content.source.forever", true);
+    logStep = config.get("content.source.log.step", 0);
+    verbose = config.get("content.source.verbose", false);
+  }
+
+}
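A concrete source plugs in by extending ContentSource: implement getNextDocData(DocData) and close(), refill the caller-supplied DocData, update the shared statistics via addDoc()/addBytes(), and honor the inherited forever flag. The class below is a minimal illustrative sketch (hypothetical name, not part of this commit), written against the API shown above:

    package org.apache.lucene.benchmark.byTask.feeds;

    import java.io.IOException;

    /** Minimal example source: emits one fixed document, then signals exhaustion. */
    public class FixedTextContentSource extends ContentSource {

      private static final String TEXT = "sample body text";
      private int emitted = 0;

      public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
        int id;
        synchronized (this) {
          if (!forever && emitted > 0) {
            // input exhausted and content.source.forever=false
            throw new NoMoreDataException();
          }
          id = ++emitted;
        }
        addDoc();                 // updates docsCount and totalDocsCount
        addBytes(TEXT.length());  // updates bytesCount and totalBytesCount
        docData.clear();          // reuse the caller-supplied instance
        docData.setName("doc" + id);
        docData.setBody(TEXT);
        return docData;
      }

      public void close() throws IOException {
        // no resources to release
      }
    }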
@@ -30,14 +30,7 @@ import java.util.Properties;
  */
 public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
 
-  public DemoHTMLParser () {
-  }
-
-  /*
-   * (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.io.Reader, java.text.DateFormat)
-   */
-  public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
+  public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
     org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader);
 
     // title
@@ -64,16 +57,22 @@ public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
         date = new Date(); // now
       }
     }
 
-    return new DocData(name, bodyBuf.toString(), title, props, date);
+    docData.clear();
+    docData.setName(name);
+    docData.setBody(bodyBuf.toString());
+    docData.setTitle(title);
+    docData.setProps(props);
+    docData.setDate(date);
+    return docData;
   }
 
   /*
    * (non-Javadoc)
    * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.lang.StringBuffer, java.text.DateFormat)
    */
-  public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException {
-    return parse(name, date, new StringReader(inputText.toString()), dateFormat);
+  public DocData parse(DocData docData, String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException {
+    return parse(docData, name, date, new StringReader(inputText.toString()), dateFormat);
   }
 
 }
@@ -23,7 +23,9 @@ import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileFilter;
 import java.io.FileReader;
+import java.io.IOException;
 import java.text.DateFormat;
+import java.text.ParsePosition;
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
 import java.util.Date;
@@ -31,31 +33,25 @@ import java.util.Locale;
 import java.util.Stack;
 
 /**
- * A DocMaker using the Dir collection for its input.
- *
- * Config properties:
- * docs.dir=<path to the docs dir| Default: dir-out>
- *
+ * A {@link ContentSource} using the Dir collection for its input. Supports
+ * the following configuration parameters (on top of {@link ContentSource}):
+ * <ul>
+ * <li><b>work.dir</b> - specifies the working directory. Required if "docs.dir"
+ * denotes a relative path (<b>default=work</b>).
+ * <li><b>docs.dir</b> - specifies the directory of the Dir collection. Can be set
+ * to a relative path if "work.dir" is also specified (<b>default=dir-out</b>).
+ * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
+ * parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
+ * </ul>
  */
-public class DirDocMaker extends BasicDocMaker {
+public class DirContentSource extends ContentSource {
 
-  protected ThreadLocal dateFormat = new ThreadLocal();
-  protected File dataDir = null;
-  protected int iteration=0;
+  private static final class DateFormatInfo {
+    DateFormat df;
+    ParsePosition pos;
+  }
 
-  static public class Iterator implements java.util.Iterator {
-
-    int count = 0;
-
-    public int getCount(){
-      return count;
-    }
-
-    Stack stack = new Stack();
-
-    /* this seems silly ... there must be a better way ...
-       not that this is good, but can it matter? */
+  public static class Iterator implements java.util.Iterator {
 
     static class Comparator implements java.util.Comparator {
       public int compare(Object _a, Object _b) {
@@ -81,22 +77,17 @@ public class DirContentSource extends ContentSource {
       }
     }
 
+    int count = 0;
+
+    Stack stack = new Stack();
+
+    /* this seems silly ... there must be a better way ...
+       not that this is good, but can it matter? */
+
     Comparator c = new Comparator();
 
-    void push(File[] files) {
-      Arrays.sort(files, c);
-      for(int i = 0; i < files.length; i++) {
-        // System.err.println("push " + files[i]);
-        stack.push(files[i]);
-      }
-    }
-
-    void push(File f) {
-      push(f.listFiles(new FileFilter() {
-        public boolean accept(File f) { return f.isDirectory(); } }));
-      push(f.listFiles(new FileFilter() {
-        public boolean accept(File f) { return f.getName().endsWith(".txt"); } }));
-      find();
+    public Iterator(File f) {
+      push(f);
     }
 
     void find() {
|
|||
push(f);
|
||||
}
|
||||
|
||||
public Iterator(File f) {
|
||||
push(f);
|
||||
void push(File f) {
|
||||
push(f.listFiles(new FileFilter() {
|
||||
|
||||
public boolean accept(File file) {
|
||||
return file.isDirectory();
|
||||
}
|
||||
}));
|
||||
push(f.listFiles(new FileFilter() {
|
||||
|
||||
public boolean accept(File file) {
|
||||
return file.getName().endsWith(".txt");
|
||||
}
|
||||
}));
|
||||
find();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new RuntimeException("cannot");
|
||||
void push(File[] files) {
|
||||
Arrays.sort(files, c);
|
||||
for(int i = 0; i < files.length; i++) {
|
||||
// System.err.println("push " + files[i]);
|
||||
stack.push(files[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public int getCount(){
|
||||
return count;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return stack.size() > 0;
|
||||
}
|
||||
|
||||
|
||||
public Object next() {
|
||||
assert hasNext();
|
||||
count++;
|
||||
|
@@ -131,42 +142,44 @@
       return object;
     }
 
-  }
-
-  protected Iterator inputFiles = null;
-
-  /* (non-Javadoc)
-   * @see SimpleDocMaker#setConfig(java.util.Properties)
-   */
-  public void setConfig(Config config) {
-    super.setConfig(config);
-    String d = config.get("docs.dir", "dir-out");
-    dataDir = new File(d);
-    if (!dataDir.isAbsolute()) {
-      dataDir = new File(new File("work"), d);
+    public void remove() {
+      throw new RuntimeException("cannot");
     }
 
-    inputFiles = new Iterator(dataDir);
-
-    if (inputFiles==null) {
-      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
-    }
   }
 
+  private ThreadLocal dateFormat = new ThreadLocal();
+  private File dataDir = null;
+  private int iteration = 0;
+  private Iterator inputFiles = null;
+
   // get/initiate a thread-local simple date format (must do so
   // because SimpleDateFormat is not thread-safe).
-  protected DateFormat getDateFormat () {
-    DateFormat df = (DateFormat) dateFormat.get();
-    if (df == null) {
+  private DateFormatInfo getDateFormatInfo() {
+    DateFormatInfo dfi = (DateFormatInfo) dateFormat.get();
+    if (dfi == null) {
+      dfi = new DateFormatInfo();
+      dfi.pos = new ParsePosition(0);
       // date format: 30-MAR-1987 14:22:36.87
-      df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
-      df.setLenient(true);
-      dateFormat.set(df);
+      dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.US);
+      dfi.df.setLenient(true);
+      dateFormat.set(dfi);
     }
-    return df;
+    return dfi;
   }
 
-  protected DocData getNextDocData() throws Exception {
+  private Date parseDate(String dateStr) {
+    DateFormatInfo dfi = getDateFormatInfo();
+    dfi.pos.setIndex(0);
+    dfi.pos.setErrorIndex(-1);
+    return dfi.df.parse(dateStr.trim(), dfi.pos);
+  }
+
+  public void close() throws IOException {
+    inputFiles = null;
+  }
+
+  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
     File f = null;
     String name = null;
     synchronized (this) {
@@ -197,27 +210,37 @@
     reader.close();
     addBytes(f.length());
 
-    Date date = getDateFormat().parse(dateStr.trim());
-    return new DocData(name, bodyBuf.toString(), title, null, date);
+    Date date = parseDate(dateStr);
+
+    docData.clear();
+    docData.setName(name);
+    docData.setBody(bodyBuf.toString());
+    docData.setTitle(title);
+    docData.setDate(date);
+    return docData;
   }
 
-  /*
-   * (non-Javadoc)
-   * @see DocMaker#resetIinputs()
-   */
-  public synchronized void resetInputs() {
+  public synchronized void resetInputs() throws IOException {
     super.resetInputs();
     inputFiles = new Iterator(dataDir);
     iteration = 0;
   }
 
-  /*
-   * (non-Javadoc)
-   * @see DocMaker#numUniqueTexts()
-   */
-  public int numUniqueTexts() {
-    return inputFiles.getCount();
+  public void setConfig(Config config) {
+    super.setConfig(config);
+
+    File workDir = new File(config.get("work.dir", "work"));
+    String d = config.get("docs.dir", "dir-out");
+    dataDir = new File(d);
+    if (!dataDir.isAbsolute()) {
+      dataDir = new File(workDir, d);
+    }
+
+    inputFiles = new Iterator(dataDir);
+
+    if (inputFiles == null) {
+      throw new RuntimeException("No txt files in dataDir: " + dataDir.getAbsolutePath());
+    }
+  }
+
 }
@@ -20,94 +20,77 @@ package org.apache.lucene.benchmark.byTask.feeds;
 import java.util.Date;
 import java.util.Properties;
 
-/**
- * Output of parsing (e.g. HTML parsing) of an input document.
- */
+import org.apache.lucene.document.DateTools;
+
+/** Output of parsing (e.g. HTML parsing) of an input document. */
 public class DocData {
 
   private String name;
   private String body;
   private String title;
-  private Date date;
+  private String date;
   private Properties props;
 
-  public DocData(String name, String body, String title, Properties props, Date date) {
-    this.name = name;
-    this.body = body;
-    this.title = title;
-    this.date = date;
-    this.props = props;
+  public void clear() {
+    name = null;
+    body = null;
+    title = null;
+    date = null;
+    props = null;
   }
 
-  /**
-   * @return Returns the name.
-   */
-  public String getName() {
-    return name;
-  }
-
-  /**
-   * @param name The name to set.
-   */
-  public void setName(String name) {
-    this.name = name;
-  }
-
-  /**
-   * @return Returns the props.
-   */
-  public Properties getProps() {
-    return props;
-  }
-
-  /**
-   * @param props The props to set.
-   */
-  public void setProps(Properties props) {
-    this.props = props;
-  }
-
-  /**
-   * @return Returns the body.
-   */
-
   public String getBody() {
     return body;
   }
 
   /**
-   * @param body The body to set.
+   * @return the date. If the ctor with Date was called, then the String
+   * returned is the output of
+   * {@link DateTools#dateToString(Date, org.apache.lucene.document.DateTools.Resolution)}
+   * . Otherwise it's the String passed to the other ctor.
    */
-  public void setBody(String body) {
-    this.body = body;
+  public String getDate() {
+    return date;
+  }
+
+  public String getName() {
+    return name;
+  }
+
+  public Properties getProps() {
+    return props;
   }
 
-  /**
-   * @return Returns the title.
-   */
   public String getTitle() {
     return title;
   }
 
-  /**
-   * @param title The title to set.
-   */
+  public void setBody(String body) {
+    this.body = body;
+  }
+
+  public void setDate(Date date) {
+    if (date != null) {
+      setDate(DateTools.dateToString(date, DateTools.Resolution.SECOND));
+    } else {
+      this.date = null;
+    }
+  }
+
+  public void setDate(String date) {
+    this.date = date;
+  }
+
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  public void setProps(Properties props) {
+    this.props = props;
+  }
+
   public void setTitle(String title) {
     this.title = title;
   }
 
-  /**
-   * @return Returns the date.
-   */
-  public Date getDate() {
-    return date;
-  }
-
-  /**
-   * @param date The date to set.
-   */
-  public void setDate(Date date) {
-    this.date = date;
-  }
-
 }
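Replacing the constructor-built DocData with clear() plus setters is what lets one instance be recycled across getNextDocData calls. A sketch of a consuming loop — a hypothetical helper assumed to sit in the same feeds package, not from this commit:

    // Drains a configured ContentSource, reusing a single DocData instance.
    static int countDocs(ContentSource source) throws IOException {
      DocData dd = new DocData();
      int n = 0;
      try {
        while (true) {
          dd = source.getNextDocData(dd); // same instance, cleared and refilled each call
          n++;
        }
      } catch (NoMoreDataException e) {
        // thrown once the input is exhausted and content.source.forever=false
      }
      return n;
    }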
@@ -17,55 +17,373 @@ package org.apache.lucene.benchmark.byTask.feeds;
 * limitations under the License.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.benchmark.byTask.utils.Config;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Map.Entry;

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;

/**
 * Create documents for the test.
 * <br>Each call to makeDocument would create the next document.
 * When input is exhausted, the DocMaker iterates over the input again,
 * providing a source for an unlimited number of documents,
 * though not all of them are unique.
 *
 * Creates {@link Document} objects. Uses a {@link ContentSource} to generate
 * {@link DocData} objects. Supports the following parameters:
 * <ul>
 * <li><b>content.source</b> - specifies the {@link ContentSource} class to use
 * (default <b>SingleDocSource</b>).
 * <li><b>doc.stored</b> - specifies whether fields should be stored (default
 * <b>false</b>).
 * <li><b>doc.tokenized</b> - specifies whether fields should be tokenized
 * (default <b>true</b>).
 * <li><b>doc.term.vector</b> - specifies whether term vectors should be stored
 * for fields (default <b>false</b>).
 * <li><b>doc.term.vector.positions</b> - specifies whether term vectors should
 * be stored with positions (default <b>false</b>).
 * <li><b>doc.term.vector.offsets</b> - specifies whether term vectors should be
 * stored with offsets (default <b>false</b>).
 * <li><b>doc.store.body.bytes</b> - specifies whether to store the raw bytes of
 * the document's content in the document (default <b>false</b>).
 * <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects
 * should be reused (default <b>true</b>).
 * </ul>
 */
public interface DocMaker {
public class DocMaker {

  /**
   * Create the next document, of the given size by input bytes.
   * If the implementation does not support control over size, an exception is thrown.
   * @param size size of document, or 0 if there is no size requirement.
   * @exception Exception if the document cannot be made, or if size>0 was specified but this feature is not supported.
   */
  private static class LeftOver {
    private DocData docdata;
    private int cnt;
  }

  static class DocState {

    private Map fields;
    private boolean reuseFields;
    Document doc;
    DocData docData = new DocData();

    public DocState(boolean reuseFields, Store store, Index index, TermVector termVector) {

      this.reuseFields = reuseFields;

      if (reuseFields) {
        fields = new HashMap();

        // Initialize the map with the default fields.
        fields.put(BODY_FIELD, new Field(BODY_FIELD, "", store, index, termVector));
        fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", store, index, termVector));
        fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector));
        fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
        fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));

        doc = new Document();
      }
    }

    /**
     * Returns a field corresponding to the field name. If
     * <code>reuseFields</code> was set to true, then it attempts to reuse a
     * Field instance. If such a field does not exist, it creates a new one.
     */
    Field getField(String name, Store store, Index index, TermVector termVector) {
      if (!reuseFields) {
        return new Field(name, "", store, index, termVector);
      }

      Field f = (Field) fields.get(name);
      if (f == null) {
        f = new Field(name, "", store, index, termVector);
        fields.put(name, f);
      }
      return f;
    }
  }

  private int numDocsCreated = 0;
  private boolean storeBytes = false;

  // leftovers are thread local, because it is unsafe to share residues between threads
  private ThreadLocal leftovr = new ThreadLocal();
  private ThreadLocal docState = new ThreadLocal();

  public static final String BODY_FIELD = "body";
  public static final String TITLE_FIELD = "doctitle";
  public static final String DATE_FIELD = "docdate";
  public static final String ID_FIELD = "docid";
  public static final String BYTES_FIELD = "bytes";
  public static final String NAME_FIELD = "docname";

  protected Config config;

  protected Store storeVal = Store.NO;
  protected Index indexVal = Index.ANALYZED;
  protected TermVector termVecVal = TermVector.NO;

  protected ContentSource source;
  protected boolean reuseFields;
  protected DocState localDocState;

  private int lastPrintedNumUniqueTexts = 0;

  private long lastPrintedNumUniqueBytes = 0;

  private int printNum = 0;

  // create a doc
  // use only part of the body, modify it to keep the rest (or use all if size==0).
  // reset the docdata properties so they are not added more than once.
  private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
    int docid = incrNumDocsCreated();
    DocState ds = reuseFields ? getDocState() : localDocState;
    Document doc = reuseFields ? ds.doc : new Document();
    doc.getFields().clear();

    // Set ID_FIELD
    Field idField = ds.getField(ID_FIELD, storeVal, indexVal, termVecVal);
    idField.setValue("doc" + docid);
    doc.add(idField);

    // Set NAME_FIELD
    String name = docData.getName();
    if (name == null) name = "";
    name = cnt < 0 ? name : name + "_" + cnt;
    Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal);
    nameField.setValue(name);
    doc.add(nameField);

    // Set DATE_FIELD
    String date = docData.getDate();
    if (date == null) {
      date = "";
    }
    Field dateField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
    dateField.setValue(date);
    doc.add(dateField);

    // Set TITLE_FIELD
    String title = docData.getTitle();
    Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
    titleField.setValue(title == null ? "" : title);
    doc.add(titleField);

    String body = docData.getBody();
    if (body != null && body.length() > 0) {
      String bdy;
      if (size <= 0 || size >= body.length()) {
        bdy = body; // use all
        docData.setBody(""); // nothing left
      } else {
        // attempt not to break words - if whitespace found within next 20 chars...
        for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
          if (Character.isWhitespace(body.charAt(n))) {
            size = n;
            break;
          }
        }
        bdy = body.substring(0, size); // use part
        docData.setBody(body.substring(size)); // some left
      }
      Field bodyField = ds.getField(BODY_FIELD, storeVal, indexVal, termVecVal);
      bodyField.setValue(bdy);
      doc.add(bodyField);

      if (storeBytes) {
        Field bytesField = ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
        bytesField.setValue(bdy.getBytes("UTF-8"));
        doc.add(bytesField);
      }
    }

    Properties props = docData.getProps();
    if (props != null) {
      for (Iterator iterator = props.entrySet().iterator(); iterator.hasNext();) {
        Entry entry = (Entry) iterator.next();
        Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal);
        f.setValue((String) entry.getValue());
        doc.add(f);
      }
      docData.setProps(null);
    }
    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
    return doc;
  }

  private void resetLeftovers() {
    leftovr.set(null);
  }

  protected DocState getDocState() {
    DocState ds = (DocState) docState.get();
    if (ds == null) {
      ds = new DocState(true, storeVal, indexVal, termVecVal);
      docState.set(ds);
    }
    return ds;
  }

  protected synchronized int incrNumDocsCreated() {
    return numDocsCreated++;
  }

  /**
   * Closes the {@link DocMaker}. The base implementation closes the
   * {@link ContentSource}, and it can be overridden to do more work (but make
   * sure to call super.close()).
   */
  public void close() throws IOException {
    source.close();
  }

  /**
   * Returns the number of bytes generated by the content source since the last
   * reset.
   */
  public synchronized long getBytesCount() {
    return source.getBytesCount();
  }

  /**
   * Returns the total number of bytes that were generated by the content
   * source defined for this doc maker.
   */
  public Document makeDocument (int size) throws Exception;
  public long getTotalBytesCount() {
    return source.getTotalBytesCount();
  }

  /** Create the next document. */
  public Document makeDocument () throws Exception;

  /**
   * Creates a {@link Document} object ready for indexing. This method uses the
   * {@link ContentSource} to get the next document from the source, and creates
   * a {@link Document} object from the returned fields. If
   * <code>reuseFields</code> was set to true, it will reuse {@link Document}
   * and {@link Field} instances.
   */
  public Document makeDocument() throws Exception {
    resetLeftovers();
    DocData docData = source.getNextDocData(reuseFields ? getDocState().docData : localDocState.docData);
    Document doc = createDocument(docData, 0, -1);
    return doc;
  }

  /** Set the properties */
  public void setConfig (Config config);

  /**
   * Same as {@link #makeDocument()}, only this method creates a document of the
   * given size input by <code>size</code>.
   */
  public Document makeDocument(int size) throws Exception {
    LeftOver lvr = (LeftOver) leftovr.get();
    if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null
        || lvr.docdata.getBody().length() == 0) {
      resetLeftovers();
    }
    DocData docData = reuseFields ? getDocState().docData : localDocState.docData;
    DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata);
    int cnt = (lvr == null ? 0 : lvr.cnt);
    while (dd.getBody() == null || dd.getBody().length() < size) {
      DocData dd2 = dd;
      dd = source.getNextDocData(new DocData());
      cnt = 0;
      dd.setBody(dd2.getBody() + dd.getBody());
    }
    Document doc = createDocument(dd, size, cnt);
    if (dd.getBody() == null || dd.getBody().length() == 0) {
      resetLeftovers();
    } else {
      if (lvr == null) {
        lvr = new LeftOver();
        leftovr.set(lvr);
      }
      lvr.docdata = dd;
      lvr.cnt = ++cnt;
    }
    return doc;
  }

  public void printDocStatistics() {
    boolean print = false;
    String col = " ";
    StringBuffer sb = new StringBuffer();
    String newline = System.getProperty("line.separator");
    sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline);
    int nut = source.getTotalDocsCount();
    if (nut > lastPrintedNumUniqueTexts) {
      print = true;
      sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline);
      lastPrintedNumUniqueTexts = nut;
    }
    long nub = getTotalBytesCount();
    if (nub > lastPrintedNumUniqueBytes) {
      print = true;
      sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
      lastPrintedNumUniqueBytes = nub;
    }
    if (source.getDocsCount() > 0) {
      print = true;
      sb.append("num docs added since last inputs reset: ").append(Format.format(0,source.getDocsCount(),col)).append(newline);
      sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getBytesCount(),col)).append(newline);
    }
    if (print) {
      System.out.println(sb.append(newline).toString());
      printNum++;
    }
  }

  /** Reset inputs so that the test run would behave, input-wise, as if it just started. */
  public void resetInputs();
  public synchronized void resetInputs() throws IOException {
    printDocStatistics();
    // re-initialize, since per-round properties may have changed.
    setConfig(config);
    source.resetInputs();
    numDocsCreated = 0;
    resetLeftovers();
  }

  /** Return how many real unique texts are available, 0 if not applicable. */
  public int numUniqueTexts();

  /** Return total bytes of all available unique texts, 0 if not applicable */
  public long numUniqueBytes();

  /** Set the configuration parameters of this doc maker. */
  public void setConfig(Config config) {
    this.config = config;
    try {
      String sourceClass = config.get("content.source", "org.apache.lucene.benchmark.byTask.feeds.SingleDocSource");
      source = (ContentSource) Class.forName(sourceClass).newInstance();
      source.setConfig(config);
    } catch (Exception e) {
      // Should not get here. Throw runtime exception.
      throw new RuntimeException(e);
    }

    /** Return number of docs made since last reset. */
    public int getCount();

    boolean stored = config.get("doc.stored", false);
    boolean tokenized = config.get("doc.tokenized", true);
    boolean termVec = config.get("doc.term.vector", false);
    storeVal = (stored ? Field.Store.YES : Field.Store.NO);
    indexVal = (tokenized ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED);
    boolean termVecPositions = config.get("doc.term.vector.positions", false);
    boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
    if (termVecPositions && termVecOffsets) {
      termVecVal = TermVector.WITH_POSITIONS_OFFSETS;
    } else if (termVecPositions) {
      termVecVal = TermVector.WITH_POSITIONS;
    } else if (termVecOffsets) {
      termVecVal = TermVector.WITH_OFFSETS;
    } else if (termVec) {
      termVecVal = TermVector.YES;
    } else {
      termVecVal = TermVector.NO;
    }
    storeBytes = config.get("doc.store.body.bytes", false);

    reuseFields = config.get("doc.reuse.fields", true);
    if (!reuseFields) {
      localDocState = new DocState(false, storeVal, indexVal, termVecVal);
    } else {
      // In a multi-rounds run, it is important to reset DocState since settings
      // of fields may change between rounds, and this is the only way to reset
      // the cache of all threads.
      docState = new ThreadLocal();
    }
  }

  /** Return total byte size of docs made since last reset. */
  public long getByteCount();

  /** Print some statistics on docs available/added/etc. */
  public void printDocStatistics();

  /** Set the html parser to use, when appropriate */
  public void setHTMLParser(HTMLParser htmlParser);

  /** Returns the htmlParser. */
  public HTMLParser getHtmlParser();

}
}

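Taken together, the parameters documented on the new DocMaker can be driven entirely from an .alg file; a sketch with illustrative values (every property name here appears in the setConfig() code above):

    content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource
    doc.stored=false
    doc.tokenized=true
    doc.term.vector.positions=true
    doc.term.vector.offsets=false
    doc.store.body.bytes=false
    doc.reuse.fields=true
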
@@ -0,0 +1,294 @@
package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * A {@link ContentSource} which reads the English Wikipedia dump. You can read
 * the .bz2 file directly (it will be decompressed on the fly). Config
 * properties:
 * <ul>
 * <li>keep.image.only.docs=false|true (default <b>true</b>).
 * <li>docs.file=<path to the file>
 * </ul>
 */
public class EnwikiContentSource extends ContentSource {

  private class Parser extends DefaultHandler implements Runnable {
    private Thread t;
    private boolean threadDone;
    private String[] tuple;
    private NoMoreDataException nmde;
    private StringBuffer contents = new StringBuffer();
    private String title;
    private String body;
    private String time;
    private String id;

    String[] next() throws NoMoreDataException {
      if (t == null) {
        threadDone = false;
        t = new Thread(this);
        t.setDaemon(true);
        t.start();
      }
      String[] result;
      synchronized(this){
        while(tuple == null && nmde == null && !threadDone) {
          try {
            wait();
          } catch (InterruptedException ie) {
          }
        }
        if (nmde != null) {
          // Set to null so we will re-start thread in case
          // we are re-used:
          t = null;
          throw nmde;
        }
        if (t != null && threadDone) {
          // The thread has exited yet did not hit end of
          // data, so this means it hit an exception. We
          // throw NoMoreDataException here to force
          // benchmark to stop the current alg:
          throw new NoMoreDataException();
        }
        result = tuple;
        tuple = null;
        notify();
      }
      return result;
    }

    String time(String original) {
      StringBuffer buffer = new StringBuffer();

      buffer.append(original.substring(8, 10));
      buffer.append('-');
      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
      buffer.append('-');
      buffer.append(original.substring(0, 4));
      buffer.append(' ');
      buffer.append(original.substring(11, 19));
      buffer.append(".000");

      return buffer.toString();
    }

    public void characters(char[] ch, int start, int length) {
      contents.append(ch, start, length);
    }

    public void endElement(String namespace, String simple, String qualified)
        throws SAXException {
      int elemType = getElementType(qualified);
      switch (elemType) {
        case PAGE:
          // the body must not be null, and either we keep image docs or the
          // title does not start with Image:
          if (body != null && (keepImages || !title.startsWith("Image:"))) {
            String[] tmpTuple = new String[LENGTH];
            tmpTuple[TITLE] = title.replace('\t', ' ');
            tmpTuple[DATE] = time.replace('\t', ' ');
            tmpTuple[BODY] = body.replaceAll("[\t\n]", " ");
            tmpTuple[ID] = id;
            synchronized(this) {
              while (tuple != null) {
                try {
                  wait();
                } catch (InterruptedException ie) {
                }
              }
              tuple = tmpTuple;
              notify();
            }
          }
          break;
        case BODY:
          body = contents.toString();
          // workaround: startsWith() has no ignore-case option, so compare a
          // lowercased prefix.
          String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
          if (startsWith.startsWith("#redirect")) {
            body = null;
          }
          break;
        case DATE:
          time = time(contents.toString());
          break;
        case TITLE:
          title = contents.toString();
          break;
        case ID:
          id = contents.toString();
          break;
        default:
          // this element should be discarded.
      }
    }

    public void run() {

      try {
        XMLReader reader = XMLReaderFactory.createXMLReader();
        reader.setContentHandler(this);
        reader.setErrorHandler(this);
        while(true){
          final InputStream localFileIS = is;
          try {
            reader.parse(new InputSource(localFileIS));
          } catch (IOException ioe) {
            synchronized(EnwikiContentSource.this) {
              if (localFileIS != is) {
                // fileIS was closed on us, so, just fall
                // through
              } else
                // Exception is real
                throw ioe;
            }
          }
          synchronized(this) {
            if (!forever) {
              nmde = new NoMoreDataException();
              notify();
              return;
            } else if (localFileIS == is) {
              // If file is not already re-opened then re-open it now
              is = getInputStream(file);
            }
          }
        }
      } catch (SAXException sae) {
        throw new RuntimeException(sae);
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      } finally {
        synchronized(this) {
          threadDone = true;
          notify();
        }
      }
    }

    public void startElement(String namespace, String simple, String qualified,
                             Attributes attributes) {
      int elemType = getElementType(qualified);
      switch (elemType) {
        case PAGE:
          title = null;
          body = null;
          time = null;
          id = null;
          break;
        // intentional fall-through.
        case BODY:
        case DATE:
        case TITLE:
        case ID:
          contents.setLength(0);
          break;
        default:
          // this element should be discarded.
      }
    }
  }

  private static final Map ELEMENTS = new HashMap();
  private static final int TITLE = 0;
  private static final int DATE = TITLE + 1;
  private static final int BODY = DATE + 1;
  private static final int ID = BODY + 1;
  private static final int LENGTH = ID + 1;
  // LENGTH is used as the size of the tuple, so whatever constants we need that
  // should not be part of the tuple, we should define them after LENGTH.
  private static final int PAGE = LENGTH + 1;

  private static final String[] months = {"JAN", "FEB", "MAR", "APR",
                                          "MAY", "JUN", "JUL", "AUG",
                                          "SEP", "OCT", "NOV", "DEC"};

  static {
    ELEMENTS.put("page", Integer.valueOf(PAGE));
    ELEMENTS.put("text", Integer.valueOf(BODY));
    ELEMENTS.put("timestamp", Integer.valueOf(DATE));
    ELEMENTS.put("title", Integer.valueOf(TITLE));
    ELEMENTS.put("id", Integer.valueOf(ID));
  }

  /**
   * Returns the type of the element if defined, otherwise returns -1. This
   * method is useful in startElement and endElement, by not needing to compare
   * the element qualified name over and over.
   */
  private final static int getElementType(String elem) {
    Integer val = (Integer) ELEMENTS.get(elem);
    return val == null ? -1 : val.intValue();
  }

  private File file;
  private boolean keepImages = true;
  private InputStream is;
  private Parser parser = new Parser();

  public void close() throws IOException {
    synchronized (EnwikiContentSource.this) {
      if (is != null) {
        is.close();
        is = null;
      }
    }
  }

  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    String[] tuple = parser.next();
    docData.clear();
    docData.setName(tuple[ID]);
    docData.setBody(tuple[BODY]);
    docData.setDate(tuple[DATE]);
    docData.setTitle(tuple[TITLE]);
    return docData;
  }

  public void resetInputs() throws IOException {
    super.resetInputs();
    is = getInputStream(file);
  }

  public void setConfig(Config config) {
    super.setConfig(config);
    keepImages = config.get("keep.image.only.docs", true);
    String fileName = config.get("docs.file", null);
    if (fileName == null) {
      throw new IllegalArgumentException("docs.file must be set");
    }
    file = new File(fileName).getAbsoluteFile();
  }

}

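A sketch of wiring this source into an algorithm file (the dump path is illustrative, not from the commit):

    content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
    docs.file=/path/to/enwiki-pages-articles.xml.bz2
    keep.image.only.docs=false
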
@@ -17,288 +17,54 @@ package org.apache.lucene.benchmark.byTask.feeds;
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;

/**
 * A {@link LineDocMaker} which reads the English Wikipedia
 * dump. You can read the .bz2 file directly (it will be
 * decompressed on the fly).
 * Config properties:
 * <ul>
 * <li>keep.image.only.docs=false|true
 * <li>[those available in {@link LineDocMaker}]
 * </ul>
 *
 * @see org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
 *
 * A {@link DocMaker} which reads the English Wikipedia dump. Uses
 * {@link EnwikiContentSource} as its content source, regardless if a different
 * content source was defined in the configuration.
 */
public class EnwikiDocMaker extends LineDocMaker {
public class EnwikiDocMaker extends DocMaker {

  private static final Map ELEMENTS = new HashMap();

  static final int TITLE = 0;
  static final int DATE = TITLE + 1;
  static final int BODY = DATE + 1;
  static final int ID = BODY + 1;
  static final int LENGTH = ID + 1;
  // LENGTH is used as the size of the tuple, so whatever constants we need that
  // should not be part of the tuple, we should define them after LENGTH.
  static final int PAGE = LENGTH + 1;

  static final String[] months = {"JAN", "FEB", "MAR", "APR",
                                  "MAY", "JUN", "JUL", "AUG",
                                  "SEP", "OCT", "NOV", "DEC"};

  public Document makeDocument() throws Exception {
    DocState ds = reuseFields ? getDocState() : localDocState;
    DocData dd = source.getNextDocData(ds.docData);
    Document doc = reuseFields ? ds.doc : new Document();
    doc.getFields().clear();

  static {
    ELEMENTS.put("page", new Integer(PAGE));
    ELEMENTS.put("text", new Integer(BODY));
    ELEMENTS.put("timestamp", new Integer(DATE));
    ELEMENTS.put("title", new Integer(TITLE));
    ELEMENTS.put("id", new Integer(ID));
  }

    Field body = ds.getField(BODY_FIELD, storeVal, Index.ANALYZED, termVecVal);
    body.setValue(dd.getBody());
    doc.add(body);

    Field title = ds.getField(TITLE_FIELD, storeVal, Index.ANALYZED, termVecVal);
    title.setValue(dd.getTitle());
    doc.add(title);

    Field date = ds.getField(DATE_FIELD, storeVal, Index.ANALYZED, termVecVal);
    date.setValue(dd.getDate());
    doc.add(date);

    Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
    id.setValue(dd.getName());
    doc.add(id);

    return doc;
  }

  /**
   * Returns the type of the element if defined, otherwise returns -1. This
   * method is useful in startElement and endElement, by not needing to compare
   * the element qualified name over and over.
   */
  private final static int getElementType(String elem) {
    Integer val = (Integer) ELEMENTS.get(elem);
    return val == null ? -1 : val.intValue();
  }

  public Document makeDocument(int size) throws Exception {
    throw new RuntimeException("cannot change document size with EnwikiDocMaker");
  }

  protected boolean keepImages = true;

  public void setConfig(Config config) {
    super.setConfig(config);
    keepImages = config.get("keep.image.only.docs", true);
    // Override whatever content source was set in the config
    source = new EnwikiContentSource();
    source.setConfig(config);
  }

  class Parser extends DefaultHandler implements Runnable {
    Thread t;
    boolean threadDone;

    public void run() {

      try {
        XMLReader reader =
          XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
        reader.setContentHandler(this);
        reader.setErrorHandler(this);
        while(true){
          final InputStream localFileIS = fileIS;
          try {
            InputSource is = new InputSource(localFileIS);
            reader.parse(is);
          } catch (IOException ioe) {
            synchronized(EnwikiDocMaker.this) {
              if (localFileIS != fileIS) {
                // fileIS was closed on us, so, just fall
                // through
              } else
                // Exception is real
                throw ioe;
            }
          }
          synchronized(this) {
            if (!forever) {
              nmde = new NoMoreDataException();
              notify();
              return;
            } else if (localFileIS == fileIS) {
              // If file is not already re-opened then
              // re-open it now
              openFile();
            }
          }
        }
      } catch (SAXException sae) {
        throw new RuntimeException(sae);
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      } finally {
        synchronized(this) {
          threadDone = true;
          notify();
        }
      }
    }

    String[] tuple;
    NoMoreDataException nmde;

    String[] next() throws NoMoreDataException {
      if (t == null) {
        threadDone = false;
        t = new Thread(this);
        t.setDaemon(true);
        t.start();
      }
      String[] result;
      synchronized(this){
        while(tuple == null && nmde == null && !threadDone) {
          try {
            wait();
          } catch (InterruptedException ie) {
          }
        }
        if (nmde != null) {
          // Set to null so we will re-start thread in case
          // we are re-used:
          t = null;
          throw nmde;
        }
        if (t != null && threadDone) {
          // The thread has exited yet did not hit end of
          // data, so this means it hit an exception. We
          // throw NoMoreDataException here to force
          // benchmark to stop the current alg:
          throw new NoMoreDataException();
        }
        result = tuple;
        tuple = null;
        notify();
      }
      return result;
    }

    StringBuffer contents = new StringBuffer();

    public void characters(char[] ch, int start, int length) {
      contents.append(ch, start, length);
    }

    String title;
    String body;
    String time;
    String id;

    public void startElement(String namespace,
                             String simple,
                             String qualified,
                             Attributes attributes) {
      int elemType = getElementType(qualified);
      switch (elemType) {
        case PAGE:
          title = null;
          body = null;
          time = null;
          id = null;
          break;
        // intentional fall-through.
        case BODY:
        case DATE:
        case TITLE:
        case ID:
          contents.setLength(0);
          break;
        default:
          // this element should be discarded.
      }
    }

    String time(String original) {
      StringBuffer buffer = new StringBuffer();

      buffer.append(original.substring(8, 10));
      buffer.append('-');
      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
      buffer.append('-');
      buffer.append(original.substring(0, 4));
      buffer.append(' ');
      buffer.append(original.substring(11, 19));
      buffer.append(".000");

      return buffer.toString();
    }

    public void create(String title, String time, String body, String id) {
      String[] t = new String[LENGTH];
      t[TITLE] = title.replace('\t', ' ');
      t[DATE] = time.replace('\t', ' ');
      t[BODY] = body.replaceAll("[\t\n]", " ");
      t[ID] = id;
      synchronized(this) {
        while(tuple!=null) {
          try {
            wait();
          } catch (InterruptedException ie) {
          }
        }
        tuple = t;
        notify();
      }
    }

    public void endElement(String namespace, String simple, String qualified)
        throws SAXException {
      int elemType = getElementType(qualified);
      switch (elemType) {
        case PAGE:
          // the body must not be null, and either we keep image docs or the
          // title does not start with Image:
          if (body != null && (keepImages || !title.startsWith("Image:"))) {
            create(title, time, body, id);
          }
          break;
        case BODY:
          body = contents.toString();
          // workaround: startsWith() has no ignore-case option, so compare a
          // lowercased prefix.
          String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
          if (startsWith.startsWith("#redirect")) {
            body = null;
          }
          break;
        case DATE:
          time = time(contents.toString());
          break;
        case TITLE:
          title = contents.toString();
          break;
        case ID:
          id = contents.toString();
          break;
        default:
          // this element should be discarded.
      }
    }
  }

  Parser parser = new Parser();

  class DocState extends LineDocMaker.DocState {
    public Document setFields(String[] tuple) {
      titleField.setValue(tuple[TITLE]);
      dateField.setValue(tuple[DATE]);
      bodyField.setValue(tuple[BODY]);
      idField.setValue(tuple[ID]);
      return doc;
    }
  }

  private DocState getDocState() {
    DocState ds = (DocState) docState.get();
    if (ds == null) {
      ds = new DocState();
      docState.set(ds);
    }
    return ds;
  }

  public Document makeDocument() throws Exception {
    String[] tuple = parser.next();
    return getDocState().setFields(tuple);
  }

}

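Since setConfig() above installs EnwikiContentSource unconditionally, an .alg file only needs to name the doc maker; any content.source line is overridden. A sketch (the dump path is illustrative):

    doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker
    docs.file=/path/to/enwiki-pages-articles.xml.bz2
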
@@ -46,7 +46,7 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMaker

    Analyzer anlzr = (Analyzer) Class.forName(config.get("analyzer",
        "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance();
    String defaultField = config.get("file.query.maker.default.field", BasicDocMaker.BODY_FIELD);
    String defaultField = config.get("file.query.maker.default.field", DocMaker.BODY_FIELD);
    QueryParser qp = new QueryParser(defaultField, anlzr);

    List qq = new ArrayList();

@@ -55,8 +55,7 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMaker
    {
      File file = new File(fileName);
      Reader reader = null;
      if (file != null && file.exists())
      {
      if (file.exists()) {
        reader = new FileReader(file);
      } else {
        //see if we can find it as a resource

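Both properties read above can be set in an .alg file; with the rename, the default field now comes from DocMaker.BODY_FIELD (whose value, per the DocMaker code above, is "body"):

    file.query.maker.default.field=body
    analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
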
@@ -39,13 +39,13 @@ public interface HTMLParser {
   * @throws IOException
   * @throws InterruptedException
   */
  public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
  public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;

  /**
   * Parse the inputText and return DocData.
   * @param inputText the html text to parse.
   * @see #parse(String, Date, Reader, DateFormat)
   */
  public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException;
  public DocData parse(DocData docData, String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException;

}

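The extra DocData argument lets callers recycle one instance across many parse calls instead of allocating per document; a minimal sketch against the new signature (variable names are illustrative):

    DocData reused = new DocData();
    // name, date, reader and dateFormat come from the caller's context
    reused = htmlParser.parse(reused, name, date, reader, dateFormat);
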
@@ -17,246 +17,76 @@ package org.apache.lucene.benchmark.byTask.feeds;
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Random;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;

/**
 * A DocMaker reading one line at a time as a Document from a single file. This
 * saves IO cost (over DirDocMaker) of recursing through a directory and opening
 * a new file for every document. It also re-uses its Document and Field
 * saves IO cost (over DirContentSource) of recursing through a directory and
 * opening a new file for every document. It also re-uses its Document and Field
 * instance to improve indexing speed.<br>
 * The expected format of each line is (arguments are separated by <TAB>):
 * <i>title, date, body</i>. If a line is read in a different format, a
 * {@link RuntimeException} will be thrown. In general, you should use this doc
 * maker with files that were created with {@link WriteLineDocTask}.<br><br>
 *
 * maker with files that were created with {@link WriteLineDocTask}.<br>
 * <br>
 * Config properties:
 * <ul>
 * <li>docs.file=<path to the file>
 * <li>doc.reuse.fields=true|false (default true)
 * <li>bzip.compression=true|false (default false)
 * <li>doc.random.id.limit=N (default -1) -- create random docid in the range
 * 0..N; this is useful with UpdateDoc to test updating random documents; if
 * this is unspecified or -1, then docid is sequentially assigned
 * </ul>
 */
public class LineDocMaker extends BasicDocMaker {
public class LineDocMaker extends DocMaker {

  InputStream fileIS;
  BufferedReader fileIn;
  ThreadLocal docState = new ThreadLocal();
  private String fileName;

  private static int READER_BUFFER_BYTES = 64*1024;
  private final DocState localDocState = new DocState();

  private boolean doReuseFields = true;
  private boolean bzipCompressionEnabled = false;
  private Random r;
  private int numDocs;

  private CompressorStreamFactory csFactory = new CompressorStreamFactory();

  class DocState {
    Document doc;
    Field bodyField;
    Field titleField;
    Field dateField;
    Field idField;

    public DocState() {

      bodyField = new Field(BasicDocMaker.BODY_FIELD, "",
                            storeVal, Field.Index.ANALYZED, termVecVal);
      titleField = new Field(BasicDocMaker.TITLE_FIELD, "",
                             storeVal, Field.Index.ANALYZED, termVecVal);
      dateField = new Field(BasicDocMaker.DATE_FIELD, "",
                            storeVal, Field.Index.ANALYZED, termVecVal);
      idField = new Field(BasicDocMaker.ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);

      doc = new Document();
      doc.add(bodyField);
      doc.add(titleField);
      doc.add(dateField);
      doc.add(idField);
    }

    final static char SEP = WriteLineDocTask.SEP;

    private int numDocsCreated;
    private synchronized int incrNumDocsCreated() {
      return numDocsCreated++;
    }

    public Document setFields(String line) {
      // A line must be in the following format. If it's not, fail !
      // title <TAB> date <TAB> body <NEWLINE>
      int spot = line.indexOf(SEP);
      if (spot == -1) {
        throw new RuntimeException("line: [" + line + "] is in an invalid format !");
      }
      int spot2 = line.indexOf(SEP, 1 + spot);
      if (spot2 == -1) {
        throw new RuntimeException("line: [" + line + "] is in an invalid format !");
      }
      final String title = line.substring(0, spot);
      final String date = line.substring(1 + spot, spot2);
      final String body = line.substring(1 + spot2, line.length());
      final String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());

      if (doReuseFields) {
        idField.setValue(docID);
        titleField.setValue(title);
        dateField.setValue(date);
        bodyField.setValue(body);
        return doc;
      } else {
        Field localIDField = new Field(BasicDocMaker.ID_FIELD, docID,
                                       Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);

        Field localTitleField = new Field(BasicDocMaker.TITLE_FIELD, title,
                                          storeVal, Field.Index.ANALYZED, termVecVal);
        Field localBodyField = new Field(BasicDocMaker.BODY_FIELD, body,
                                         storeVal, Field.Index.ANALYZED, termVecVal);
        Field localDateField = new Field(BasicDocMaker.BODY_FIELD, date,
                                         storeVal, Field.Index.ANALYZED, termVecVal);
        Document localDoc = new Document();
        localDoc.add(localIDField);
        localDoc.add(localBodyField);
        localDoc.add(localTitleField);
        localDoc.add(localDateField);
        return localDoc;
      }
    }
  }

  protected DocData getNextDocData() throws Exception {
    throw new RuntimeException("not implemented");
  }

  private DocState getDocState() {
    DocState ds = (DocState) docState.get();
    if (ds == null) {
      ds = new DocState();
      docState.set(ds);
    }
    return ds;
  }

  public Document makeDocument() throws Exception {

    String line;
    synchronized(this) {
      line = fileIn.readLine();
      if (line == null) {
        if (!forever) {
          throw new NoMoreDataException();
        }
        // Reset the file
        openFile();
        return makeDocument();
      }
    }
    DocState ds = reuseFields ? getDocState() : localDocState;
    DocData dd = source.getNextDocData(ds.docData);
    Document doc = reuseFields ? ds.doc : new Document();
    doc.getFields().clear();

    if (doReuseFields)
      return getDocState().setFields(line);
    else
      return localDocState.setFields(line);

    Field body = ds.getField(BODY_FIELD, storeVal, Index.ANALYZED, termVecVal);
    body.setValue(dd.getBody());
    doc.add(body);

    Field title = ds.getField(TITLE_FIELD, storeVal, Index.ANALYZED, termVecVal);
    title.setValue(dd.getTitle());
    doc.add(title);

    Field date = ds.getField(DATE_FIELD, storeVal, Index.ANALYZED, termVecVal);
    date.setValue(dd.getDate());
    doc.add(date);

    String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());
    Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
    id.setValue(docID);
    doc.add(id);

    return doc;
  }

  public Document makeDocument(int size) throws Exception {
    throw new RuntimeException("cannot change document size with LineDocMaker; please use DirDocMaker instead");
    throw new RuntimeException("cannot change document size with LineDocMaker");
  }

  public synchronized void resetInputs() {
    super.resetInputs();
    openFile();
  }

  public void setConfig(Config config) {
    super.setConfig(config);
    fileName = config.get("docs.file", null);
    if (fileName == null) {
      throw new IllegalArgumentException("docs.file must be set");
    }
    doReuseFields = config.get("doc.reuse.fields", true);
    String doBZCompress = config.get("bzip.compression", null);
    if (doBZCompress != null) {
      // Property was set, use the value.
      bzipCompressionEnabled = Boolean.valueOf(doBZCompress).booleanValue();
    } else {
      // Property was not set, attempt to detect based on file's extension
      bzipCompressionEnabled = fileName.endsWith("bz2");
    }
    source = new LineDocSource();
    source.setConfig(config);
    numDocs = config.get("doc.random.id.limit", -1);
    if (numDocs != -1) {
      r = new Random(179);
    }
  }

  synchronized void openFile() {
    try {
      if (fileIn != null) {
        fileIn.close();
      }
      fileIS = new FileInputStream(fileName);
      if (bzipCompressionEnabled) {
        // According to BZip2CompressorInputStream's code, it reads the first
        // two file header chars ('B' and 'Z'). We only need to wrap the
        // underlying stream with a BufferedInputStream, since the code uses
        // the read() method exclusively.
        fileIS = new BufferedInputStream(fileIS, READER_BUFFER_BYTES);
        fileIS = csFactory.createCompressorInputStream("bzip2", fileIS);
      }
      // Wrap the stream with a BufferedReader for several reasons:
      // 1. We need the readLine() method.
      // 2. Even if bzip.compression is enabled, and is wrapped with
      // BufferedInputStream, wrapping with a buffer can still improve
      // performance, since the BIS buffer will be used to read from the
      // compressed stream, while the BR buffer will be used to read from the
      // uncompressed stream.
      fileIn = new BufferedReader(new InputStreamReader(fileIS, "UTF-8"), READER_BUFFER_BYTES);
    } catch (IOException e) {
      throw new RuntimeException(e);
    } catch (CompressorException e) {
      throw new RuntimeException(e);
    }
  }

  public int numUniqueTexts() {
    return -1;
  }

}

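The doc.random.id.limit knob documented above pairs with UpdateDoc-style tasks; a sketch with illustrative values:

    doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
    docs.file=work/docs.lines.txt
    doc.random.id.limit=10000
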
@@ -0,0 +1,116 @@
package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;

/**
 * A {@link ContentSource} reading one line at a time as a
 * {@link org.apache.lucene.document.Document} from a single file. This saves IO
 * cost (over DirContentSource) of recursing through a directory and opening a
 * new file for every document.<br>
 * The expected format of each line is (arguments are separated by <TAB>):
 * <i>title, date, body</i>. If a line is read in a different format, a
 * {@link RuntimeException} will be thrown. In general, you should use this
 * content source for files that were created with {@link WriteLineDocTask}.<br>
 * <br>
 * Config properties:
 * <ul>
 * <li>docs.file=<path to the file>
 * </ul>
 */
public class LineDocSource extends ContentSource {

  private final static char SEP = WriteLineDocTask.SEP;

  private File file;
  private BufferedReader reader;

  private synchronized void openFile() {
    try {
      if (reader != null) {
        reader.close();
      }
      InputStream is = getInputStream(file);
      reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  public void close() throws IOException {
    if (reader != null) {
      reader.close();
      reader = null;
    }
  }

  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    String line;
    synchronized(this) {
      line = reader.readLine();
      if (line == null) {
        if (!forever) {
          throw new NoMoreDataException();
        }
        // Reset the file
        openFile();
        return getNextDocData(docData);
      }
    }

    // A line must be in the following format. If it's not, fail !
    // title <TAB> date <TAB> body <NEWLINE>
    int spot = line.indexOf(SEP);
    if (spot == -1) {
      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
    }
    int spot2 = line.indexOf(SEP, 1 + spot);
    if (spot2 == -1) {
      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
    }
    // The date String was written in the format of DateTools.dateToString.
    docData.clear();
    docData.setBody(line.substring(1 + spot2, line.length()));
    docData.setTitle(line.substring(0, spot));
    docData.setDate(line.substring(1 + spot, spot2));
    return docData;
  }

  public void resetInputs() throws IOException {
    super.resetInputs();
    openFile();
  }

  public void setConfig(Config config) {
    super.setConfig(config);
    String fileName = config.get("docs.file", null);
    if (fileName == null) {
      throw new IllegalArgumentException("docs.file must be set");
    }
    file = new File(fileName).getAbsoluteFile();
  }

}

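Assuming the input was produced by WriteLineDocTask, each line carries the three TAB-separated fields parsed above; for example (the date string below only illustrates a pre-formatted DateTools-style value, and <TAB> stands for a literal tab character):

    Some Title<TAB>20090617120000<TAB>body text of the document ...
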
@ -0,0 +1,147 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParsePosition;
|
||||
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;

import org.apache.lucene.benchmark.byTask.utils.Config;

/**
 * A {@link ContentSource} reading from the Reuters collection.
 * <p>
 * Config properties:
 * <ul>
 * <li><b>work.dir</b> - path to the root of docs and indexes dirs (default
 * <b>work</b>).
 * <li><b>docs.dir</b> - path to the docs dir (default <b>reuters-out</b>).
 * </ul>
 */
public class ReutersContentSource extends ContentSource {

  private static final class DateFormatInfo {
    DateFormat df;
    ParsePosition pos;
  }

  private ThreadLocal dateFormat = new ThreadLocal();
  private File dataDir = null;
  private ArrayList inputFiles = new ArrayList();
  private int nextFile = 0;
  private int iteration = 0;

  public void setConfig(Config config) {
    super.setConfig(config);
    File workDir = new File(config.get("work.dir", "work"));
    String d = config.get("docs.dir", "reuters-out");
    dataDir = new File(d);
    if (!dataDir.isAbsolute()) {
      dataDir = new File(workDir, d);
    }
    inputFiles.clear();
    collectFiles(dataDir, inputFiles);
    if (inputFiles.size() == 0) {
      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
    }
  }

  private synchronized DateFormatInfo getDateFormatInfo() {
    DateFormatInfo dfi = (DateFormatInfo) dateFormat.get();
    if (dfi == null) {
      dfi = new DateFormatInfo();
      // date format: 30-MAR-1987 14:22:36.87
      dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.US);
      dfi.df.setLenient(true);
      dfi.pos = new ParsePosition(0);
      dateFormat.set(dfi);
    }
    return dfi;
  }

  private Date parseDate(String dateStr) {
    DateFormatInfo dfi = getDateFormatInfo();
    dfi.pos.setIndex(0);
    dfi.pos.setErrorIndex(-1);
    return dfi.df.parse(dateStr.trim(), dfi.pos);
  }

  public void close() throws IOException {
    // TODO implement?
  }

  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    File f = null;
    String name = null;
    synchronized (this) {
      if (nextFile >= inputFiles.size()) {
        // exhausted files, start a new round, unless forever set to false.
        if (!forever) {
          throw new NoMoreDataException();
        }
        nextFile = 0;
        iteration++;
      }
      f = (File) inputFiles.get(nextFile++);
      name = f.getCanonicalPath() + "_" + iteration;
    }

    BufferedReader reader = new BufferedReader(new FileReader(f));
    try {
      // First line is the date, 3rd is the title, rest is body
      String dateStr = reader.readLine();
      reader.readLine(); // skip an empty line
      String title = reader.readLine();
      reader.readLine(); // skip an empty line
      StringBuffer bodyBuf = new StringBuffer(1024);
      String line = null;
      while ((line = reader.readLine()) != null) {
        bodyBuf.append(line).append(' ');
      }
      reader.close();

      addBytes(f.length());

      Date date = parseDate(dateStr.trim());

      docData.clear();
      docData.setName(name);
      docData.setBody(bodyBuf.toString());
      docData.setTitle(title);
      docData.setDate(date);
      return docData;
    } finally {
      reader.close();
    }
  }

  public synchronized void resetInputs() throws IOException {
    super.resetInputs();
    nextFile = 0;
    iteration = 0;
  }

}
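For orientation, here is a minimal sketch of how the reuse-oriented contract above is meant to be consumed. It is illustrative only, not part of this patch: it assumes Config's Properties-based constructor and the default reuters-out layout, and it mirrors the pattern used by ConsumeContentSourceTask (added later in this change) — one DocData instance is passed back in and repopulated on every call.

import java.util.Properties;

import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource;
import org.apache.lucene.benchmark.byTask.utils.Config;

public class ContentSourceDriver {
  public static void main(String[] args) throws Exception {
    // Properties as they would appear in an .alg file; values illustrative.
    Properties p = new Properties();
    p.setProperty("work.dir", "work");
    p.setProperty("docs.dir", "reuters-out");
    p.setProperty("content.source.forever", "false");

    ContentSource source = new ReutersContentSource();
    source.setConfig(new Config(p));
    source.resetInputs();

    DocData dd = new DocData();
    try {
      while (true) {
        // getNextDocData clears and refills the same DocData instance,
        // so the loop allocates no per-document objects.
        dd = source.getNextDocData(dd);
      }
    } catch (NoMoreDataException e) {
      // input exhausted, because content.source.forever=false
    } finally {
      source.close();
    }
  }
}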
@@ -1,135 +0,0 @@
package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.benchmark.byTask.utils.Config;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;


/**
 * A DocMaker using the Reuters collection for its input.
 * <p>
 * Config properties:<ul>
 * <li>work.dir=<path to the root of docs and indexes dirs| Default: work></li>
 * <li>docs.dir=<path to the docs dir| Default: reuters-out></li>
 * </ul>
 */
public class ReutersDocMaker extends BasicDocMaker {

  private ThreadLocal dateFormat = new ThreadLocal();
  private File dataDir = null;
  private ArrayList inputFiles = new ArrayList();
  private int nextFile = 0;
  private int iteration = 0;

  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
  public void setConfig(Config config) {
    super.setConfig(config);
    File workDir = new File(config.get("work.dir","work"));
    String d = config.get("docs.dir","reuters-out");
    dataDir = new File(d);
    if (!dataDir.isAbsolute()) {
      dataDir = new File(workDir, d);
    }
    resetUniqueBytes();
    inputFiles.clear();
    collectFiles(dataDir,inputFiles);
    if (inputFiles.size()==0) {
      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
    }
  }

  // get/initiate a thread-local simple date format (must do so
  // because SimpleDateFormat is not thread-safe).
  protected synchronized DateFormat getDateFormat() {
    DateFormat df = (DateFormat) dateFormat.get();
    if (df == null) {
      // date format: 30-MAR-1987 14:22:36.87
      df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
      df.setLenient(true);
      dateFormat.set(df);
    }
    return df;
  }

  protected DocData getNextDocData() throws Exception {
    File f = null;
    String name = null;
    synchronized (this) {
      if (nextFile >= inputFiles.size()) {
        // exhausted files, start a new round, unless forever set to false.
        if (!forever) {
          throw new NoMoreDataException();
        }
        nextFile = 0;
        iteration++;
      }
      f = (File) inputFiles.get(nextFile++);
      name = f.getCanonicalPath()+"_"+iteration;
    }

    BufferedReader reader = new BufferedReader(new FileReader(f));
    String line = null;
    // First line is the date, 3rd is the title, rest is body
    String dateStr = reader.readLine();
    reader.readLine(); // skip an empty line
    String title = reader.readLine();
    reader.readLine(); // skip an empty line
    StringBuffer bodyBuf = new StringBuffer(1024);
    while ((line = reader.readLine()) != null) {
      bodyBuf.append(line).append(' ');
    }
    reader.close();

    addBytes(f.length());

    Date date = getDateFormat().parse(dateStr.trim());
    return new DocData(name, bodyBuf.toString(), title, null, date);
  }

  /*
   * (non-Javadoc)
   * @see DocMaker#resetIinputs()
   */
  public synchronized void resetInputs() {
    super.resetInputs();
    nextFile = 0;
    iteration = 0;
  }

  /*
   * (non-Javadoc)
   * @see DocMaker#numUniqueTexts()
   */
  public int numUniqueTexts() {
    return inputFiles.size();
  }

}
@@ -71,7 +71,7 @@ public class ReutersQueryMaker extends AbstractQueryMaker implements QueryMaker
   * @return array of Lucene queries
   */
  private static Query[] createQueries(List qs, Analyzer a) {
-    QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD, a);
+    QueryParser qp = new QueryParser(DocMaker.BODY_FIELD, a);
    List queries = new ArrayList();
    for (int i = 0; i < qs.size(); i++) {
      try {
@@ -107,7 +107,7 @@ public class ReutersQueryMaker extends AbstractQueryMaker implements QueryMaker

    List queryList = new ArrayList(20);
    queryList.addAll(Arrays.asList(STANDARD_QUERIES));
-    queryList.addAll(Arrays.asList(getPrebuiltQueries(BasicDocMaker.BODY_FIELD)));
+    queryList.addAll(Arrays.asList(getPrebuiltQueries(DocMaker.BODY_FIELD)));
    return createQueries(queryList, anlzr);
  }

@@ -29,7 +29,7 @@ import java.util.ArrayList;

/**
 * A QueryMaker that makes queries for a collection created
- * using {@link org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker}.
+ * using {@link org.apache.lucene.benchmark.byTask.feeds.SingleDocSource}.
 */
public class SimpleQueryMaker extends AbstractQueryMaker implements QueryMaker {

@@ -45,11 +45,11 @@ public class SimpleQueryMaker extends AbstractQueryMaker implements QueryMaker {
    Analyzer anlzr= (Analyzer) Class.forName(config.get("analyzer",
        "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance();

-    QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD,anlzr);
+    QueryParser qp = new QueryParser(DocMaker.BODY_FIELD,anlzr);
    ArrayList qq = new ArrayList();
-    Query q1 = new TermQuery(new Term(BasicDocMaker.ID_FIELD,"doc2"));
+    Query q1 = new TermQuery(new Term(DocMaker.ID_FIELD,"doc2"));
    qq.add(q1);
-    Query q2 = new TermQuery(new Term(BasicDocMaker.BODY_FIELD,"simple"));
+    Query q2 = new TermQuery(new Term(DocMaker.BODY_FIELD,"simple"));
    qq.add(q2);
    BooleanQuery bq = new BooleanQuery();
    bq.add(q1,Occur.MUST);
@@ -36,7 +36,7 @@ public class SimpleSloppyPhraseQueryMaker extends SimpleQueryMaker {
    // extract some 100 words from doc text to an array
    String words[];
    ArrayList w = new ArrayList();
-    StringTokenizer st = new StringTokenizer(SimpleDocMaker.DOC_TEXT);
+    StringTokenizer st = new StringTokenizer(SingleDocSource.DOC_TEXT);
    while (st.hasMoreTokens() && w.size()<100) {
      w.add(st.nextToken());
    }
@@ -53,7 +53,7 @@ public class SimpleSloppyPhraseQueryMaker extends SimpleQueryMaker {
      q.setSlop(slop);
      int wind = wd;
      for (int i=0; i<qlen; i++) {
-        q.add(new Term(BasicDocMaker.BODY_FIELD,words[wind++]));
+        q.add(new Term(DocMaker.BODY_FIELD,words[wind++]));
        if (remainedSlop>0) {
          remainedSlop--;
          wind++;
@@ -66,7 +66,7 @@ public class SimpleSloppyPhraseQueryMaker extends SimpleQueryMaker {
      q.setSlop(slop+2*qlen);
      wind = wd+qlen+remainedSlop-1;
      for (int i=0; i<qlen; i++) {
-        q.add(new Term(BasicDocMaker.BODY_FIELD,words[wind--]));
+        q.add(new Term(DocMaker.BODY_FIELD,words[wind--]));
        if (remainedSlop>0) {
          remainedSlop--;
          wind--;
@@ -1,5 +1,7 @@
package org.apache.lucene.benchmark.byTask.feeds;

+import java.io.IOException;
+
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
@@ -18,9 +20,9 @@ package org.apache.lucene.benchmark.byTask.feeds;
 */

/**
- * Create documents for the test.
+ * Creates the same document each time {@link #getNextDocData()} is called.
 */
-public class SimpleDocMaker extends BasicDocMaker {
+public class SingleDocSource extends ContentSource {

  private int docID = 0;

@@ -42,33 +44,26 @@ public class SimpleDocMaker extends BasicDocMaker {

  // return a new docid
  private synchronized int newdocid() throws NoMoreDataException {
-    if (docID>0 && !forever) {
+    if (docID > 0 && !forever) {
      throw new NoMoreDataException();
    }
    return docID++;
  }

-  /*
-   * (non-Javadoc)
-   * @see DocMaker#resetIinputs()
-   */
-  public synchronized void resetInputs() {
+  public void close() throws IOException {}
+
+  public DocData getNextDocData(DocData docData) throws NoMoreDataException {
+    int id = newdocid();
+    addBytes(DOC_TEXT.length());
+    docData.clear();
+    docData.setName("doc" + id);
+    docData.setBody(DOC_TEXT);
+    return docData;
+  }
+
+  public synchronized void resetInputs() throws IOException {
    super.resetInputs();
    docID = 0;
  }

-  /*
-   * (non-Javadoc)
-   * @see DocMaker#numUniqueTexts()
-   */
-  public int numUniqueTexts() {
-    return 0; // not applicable
-  }
-
-  protected DocData getNextDocData() throws NoMoreDataException {
-    int id = newdocid();
-    addBytes(DOC_TEXT.length());
-    return new DocData("doc"+id, DOC_TEXT, null, null, null);
-  }

}
@@ -1,56 +0,0 @@
package org.apache.lucene.benchmark.byTask.feeds;

import java.util.Properties;
import java.util.Random;

import org.apache.lucene.benchmark.byTask.utils.Config;

/**
 * Adds fields appropriate for sorting: country,
 * random_string and sort_field (int).
 *
 */
public class SortableSimpleDocMaker extends SimpleDocMaker {
  private int sortRange;

  private static String[] COUNTRIES = new String[] {"European Union", "United States", "Japan", "Germany", "China (PRC)", "United Kingdom", "France", "Italy", "Spain", "Canada", "Brazil", "Russia", "India", "South Korea", "Australia", "Mexico", "Netherlands", "Turkey", "Sweden", "Belgium", "Indonesia", "Switzerland", "Poland", "Norway", "Republic of China", "Saudi Arabia", "Austria", "Greece", "Denmark", "Iran", "South Africa", "Argentina", "Ireland", "Thailand", "Finland", "Venezuela", "Portugal", "Hong Kong", "United Arab Emirates", "Malaysia", "Czech Republic", "Colombia", "Nigeria", "Romania", "Chile", "Israel", "Singapore", "Philippines", "Pakistan", "Ukraine", "Hungary", "Algeria", "New Zealand", "Egypt", "Kuwait", "Peru", "Kazakhstan", "Slovakia", "Morocco", "Bangladesh", "Vietnam", "Qatar", "Angola", "Libya", "Iraq", "Croatia", "Luxembourg", "Sudan", "Slovenia", "Cuba", "Belarus", "Ecuador", "Serbia", "Oman", "Bulgaria", "Lithuania", "Syria", "Dominican Republic", "Tunisia", "Guatemala", "Azerbaijan", "Sri Lanka", "Kenya", "Latvia", "Turkmenistan", "Costa Rica", "Lebanon", "Uruguay", "Uzbekistan", "Yemen", "Cyprus", "Estonia", "Trinidad and Tobago", "Cameroon", "El Salvador", "Iceland", "Panama", "Bahrain", "Ivory Coast", "Ethiopia", "Tanzania", "Jordan", "Ghana", "Bosnia and Herzegovina", "Macau", "Burma", "Bolivia", "Brunei", "Botswana", "Honduras", "Gabon", "Uganda", "Jamaica", "Zambia", "Senegal", "Paraguay", "Albania", "Equatorial Guinea", "Georgia", "Democratic Republic of the Congo", "Nepal", "Afghanistan", "Cambodia", "Armenia", "Republic of the Congo", "Mozambique", "Republic of Macedonia", "Malta", "Namibia", "Madagascar", "Chad", "Burkina Faso", "Mauritius", "Mali", "The Bahamas", "Papua New Guinea", "Nicaragua", "Haiti", "Benin", "alestinian flag West Bank and Gaza", "Jersey", "Fiji", "Guinea", "Moldova", "Niger", "Laos", "Mongolia", "French Polynesia", "Kyrgyzstan", "Barbados", "Tajikistan", "Malawi", "Liechtenstein", "New Caledonia", "Kosovo", "Rwanda", "Montenegro", "Swaziland", "Guam", "Mauritania", "Guernsey", "Isle of Man", "Togo", "Somalia", "Suriname", "Aruba", "North Korea", "Zimbabwe", "Central African Republic", "Faroe Islands", "Greenland", "Sierra Leone", "Lesotho", "Cape Verde", "Eritrea", "Bhutan", "Belize", "Antigua and Barbuda", "Gibraltar", "Maldives", "San Marino", "Guyana", "Burundi", "Saint Lucia", "Djibouti", "British Virgin Islands", "Liberia", "Seychelles", "The Gambia", "Northern Mariana Islands", "Grenada", "Saint Vincent and the Grenadines", "Saint Kitts and Nevis", "East Timor", "Vanuatu", "Comoros", "Samoa", "Solomon Islands", "Guinea-Bissau", "American Samoa", "Dominica", "Micronesia", "Tonga", "Cook Islands", "Palau", "Marshall Islands", "São Tomé and Príncipe", "Anguilla", "Kiribati", "Tuvalu", "Niue"};

  protected DocData getNextDocData() throws NoMoreDataException {
    Random r = new Random();
    DocData doc = super.getNextDocData();
    Properties props = new Properties();

    // random int
    props.put("sort_field", Integer.toString(nextInt(r, sortRange)));

    // random string
    int len = nextInt(r, 2, 20);
    char[] buffer = new char[len];
    for(int i=0;i<len;i++)
      buffer[i] = (char) nextInt(r, 0x80);
    props.put("random_string", new String(buffer));

    // random country
    props.put("country", COUNTRIES[nextInt(r, COUNTRIES.length)]);
    doc.setProps(props);
    return doc;
  }

  private int nextInt(Random r, int lim) {
    return r.nextInt(lim);
  }

  private int nextInt(Random r, int start, int end) {
    return start + r.nextInt(end-start);
  }

  /*
   * (non-Javadoc)
   *
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
  public void setConfig(Config config) {
    super.setConfig(config);
    sortRange = config.get("sort.rng", 20000);
  }
}
@@ -0,0 +1,95 @@
package org.apache.lucene.benchmark.byTask.feeds;

import java.util.Properties;
import java.util.Random;

import org.apache.lucene.benchmark.byTask.utils.Config;

/**
 * Adds fields appropriate for sorting: country, random_string and sort_field
 * (int). Supports the following parameters:
 * <ul>
 * <li><b>sort.rng</b> - defines the range for sort-by-int field (default
 * <b>20000</b>).
 * <li><b>rand.seed</b> - defines the seed to initialize Random with (default
 * <b>13</b>).
 * </ul>
 */
public class SortableSingleDocSource extends SingleDocSource {

  private static String[] COUNTRIES = new String[] {
    "European Union", "United States", "Japan", "Germany", "China (PRC)",
    "United Kingdom", "France", "Italy", "Spain", "Canada", "Brazil", "Russia",
    "India", "South Korea", "Australia", "Mexico", "Netherlands", "Turkey",
    "Sweden", "Belgium", "Indonesia", "Switzerland", "Poland", "Norway",
    "Republic of China", "Saudi Arabia", "Austria", "Greece", "Denmark", "Iran",
    "South Africa", "Argentina", "Ireland", "Thailand", "Finland", "Venezuela",
    "Portugal", "Hong Kong", "United Arab Emirates", "Malaysia",
    "Czech Republic", "Colombia", "Nigeria", "Romania", "Chile", "Israel",
    "Singapore", "Philippines", "Pakistan", "Ukraine", "Hungary", "Algeria",
    "New Zealand", "Egypt", "Kuwait", "Peru", "Kazakhstan", "Slovakia",
    "Morocco", "Bangladesh", "Vietnam", "Qatar", "Angola", "Libya", "Iraq",
    "Croatia", "Luxembourg", "Sudan", "Slovenia", "Cuba", "Belarus", "Ecuador",
    "Serbia", "Oman", "Bulgaria", "Lithuania", "Syria", "Dominican Republic",
    "Tunisia", "Guatemala", "Azerbaijan", "Sri Lanka", "Kenya", "Latvia",
    "Turkmenistan", "Costa Rica", "Lebanon", "Uruguay", "Uzbekistan", "Yemen",
    "Cyprus", "Estonia", "Trinidad and Tobago", "Cameroon", "El Salvador",
    "Iceland", "Panama", "Bahrain", "Ivory Coast", "Ethiopia", "Tanzania",
    "Jordan", "Ghana", "Bosnia and Herzegovina", "Macau", "Burma", "Bolivia",
    "Brunei", "Botswana", "Honduras", "Gabon", "Uganda", "Jamaica", "Zambia",
    "Senegal", "Paraguay", "Albania", "Equatorial Guinea", "Georgia",
    "Democratic Republic of the Congo", "Nepal", "Afghanistan", "Cambodia",
    "Armenia", "Republic of the Congo", "Mozambique", "Republic of Macedonia",
    "Malta", "Namibia", "Madagascar", "Chad", "Burkina Faso", "Mauritius",
    "Mali", "The Bahamas", "Papua New Guinea", "Nicaragua", "Haiti", "Benin",
    "alestinian flag West Bank and Gaza", "Jersey", "Fiji", "Guinea", "Moldova",
    "Niger", "Laos", "Mongolia", "French Polynesia", "Kyrgyzstan", "Barbados",
    "Tajikistan", "Malawi", "Liechtenstein", "New Caledonia", "Kosovo",
    "Rwanda", "Montenegro", "Swaziland", "Guam", "Mauritania", "Guernsey",
    "Isle of Man", "Togo", "Somalia", "Suriname", "Aruba", "North Korea",
    "Zimbabwe", "Central African Republic", "Faroe Islands", "Greenland",
    "Sierra Leone", "Lesotho", "Cape Verde", "Eritrea", "Bhutan", "Belize",
    "Antigua and Barbuda", "Gibraltar", "Maldives", "San Marino", "Guyana",
    "Burundi", "Saint Lucia", "Djibouti", "British Virgin Islands", "Liberia",
    "Seychelles", "The Gambia", "Northern Mariana Islands", "Grenada",
    "Saint Vincent and the Grenadines", "Saint Kitts and Nevis", "East Timor",
    "Vanuatu", "Comoros", "Samoa", "Solomon Islands", "Guinea-Bissau",
    "American Samoa", "Dominica", "Micronesia", "Tonga", "Cook Islands",
    "Palau", "Marshall Islands", "São Tomé and Príncipe", "Anguilla",
    "Kiribati", "Tuvalu", "Niue" };

  private int sortRange;
  private Random r;

  public DocData getNextDocData(DocData docData) throws NoMoreDataException {
    docData = super.getNextDocData(docData);
    Properties props = new Properties();

    // random int
    props.put("sort_field", Integer.toString(r.nextInt(sortRange)));

    // random string
    int len = nextInt(2, 20);
    char[] buffer = new char[len];
    for (int i = 0; i < len; i++) {
      buffer[i] = (char) r.nextInt(0x80);
    }
    props.put("random_string", new String(buffer));

    // random country
    props.put("country", COUNTRIES[r.nextInt(COUNTRIES.length)]);
    docData.setProps(props);
    return docData;
  }

  private int nextInt(int start, int end) {
    return start + r.nextInt(end - start);
  }

  public void setConfig(Config config) {
    super.setConfig(config);
    sortRange = config.get("sort.rng", 20000);
    r = new Random(config.get("rand.seed", 13));
  }

}
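One detail worth noting in the class above: unlike the removed SortableSimpleDocMaker, which constructed a fresh unseeded Random on every call, the new source seeds a single Random from rand.seed, so runs are reproducible. A minimal standalone sketch of the difference (the values printed are illustrative; only java.util.Random is involved):

import java.util.Random;

public class SeededVsUnseeded {
  public static void main(String[] args) {
    // New behavior: one Random seeded from rand.seed (default 13) -> the
    // same sequence of sort_field / country picks on every benchmark run.
    Random seeded = new Random(13);
    System.out.println(seeded.nextInt(20000)); // identical across runs

    // Old behavior: a fresh unseeded Random per document -> values differ
    // from run to run, so sort benchmarks were not reproducible.
    Random unseeded = new Random();
    System.out.println(unseeded.nextInt(20000)); // varies across runs
  }
}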
@@ -0,0 +1,339 @@
package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import java.util.zip.GZIPInputStream;

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StringBufferReader;

/**
 * Implements a {@link ContentSource} over the TREC collection.
 * <p>
 * Supports the following configuration parameters (on top of
 * {@link ContentSource}):
 * <ul>
 * <li><b>work.dir</b> - specifies the working directory. Required if "docs.dir"
 * denotes a relative path (<b>default=work</b>).
 * <li><b>docs.dir</b> - specifies the directory where the TREC files reside.
 * Can be set to a relative path if "work.dir" is also specified
 * (<b>default=trec</b>).
 * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
 * parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
 * </ul>
 */
public class TrecContentSource extends ContentSource {
  // TODO (3.0): change StringBuffer to StringBuilder

  private static final class DateFormatInfo {
    DateFormat[] dfs;
    ParsePosition pos;
  }

  private static final String DATE = "Date: ";
  private static final String DOCHDR = "<DOCHDR>";
  private static final String TERMINATING_DOCHDR = "</DOCHDR>";
  private static final String DOCNO = "<DOCNO>";
  private static final String TERMINATING_DOCNO = "</DOCNO>";
  private static final String DOC = "<DOC>";
  private static final String TERMINATING_DOC = "</DOC>";

  private static final String NEW_LINE = System.getProperty("line.separator");

  private static final String DATE_FORMATS [] = {
    "EEE, dd MMM yyyy kk:mm:ss z",   // Tue, 09 Dec 2003 22:39:08 GMT
    "EEE MMM dd kk:mm:ss yyyy z",    // Tue Dec 09 16:45:08 2003 EST
    "EEE, dd-MMM-':'y kk:mm:ss z",   // Tue, 09 Dec 2003 22:39:08 GMT
    "EEE, dd-MMM-yyy kk:mm:ss z",    // Tue, 09 Dec 2003 22:39:08 GMT
    "EEE MMM dd kk:mm:ss yyyy",      // Tue Dec 09 16:45:08 2003
  };

  private ThreadLocal dateFormats = new ThreadLocal();
  private ThreadLocal trecDocReader = new ThreadLocal();
  private ThreadLocal trecDocBuffer = new ThreadLocal();
  private File dataDir = null;
  private ArrayList inputFiles = new ArrayList();
  private int nextFile = 0;
  private int rawDocSize;

  // Use to synchronize threads on reading from the TREC documents.
  private Object lock = new Object();

  // Required for test
  BufferedReader reader;
  int iteration = 0;
  HTMLParser htmlParser;

  private DateFormatInfo getDateFormatInfo() {
    DateFormatInfo dfi = (DateFormatInfo) dateFormats.get();
    if (dfi == null) {
      dfi = new DateFormatInfo();
      dfi.dfs = new SimpleDateFormat[DATE_FORMATS.length];
      for (int i = 0; i < dfi.dfs.length; i++) {
        dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.US);
        dfi.dfs[i].setLenient(true);
      }
      dfi.pos = new ParsePosition(0);
      dateFormats.set(dfi);
    }
    return dfi;
  }

  private StringBuffer getDocBuffer() {
    StringBuffer sb = (StringBuffer) trecDocBuffer.get();
    if (sb == null) {
      sb = new StringBuffer();
      trecDocBuffer.set(sb);
    }
    return sb;
  }

  private Reader getTrecDocReader(StringBuffer docBuffer) {
    StringBufferReader r = (StringBufferReader) trecDocReader.get();
    if (r == null) {
      r = new StringBufferReader(docBuffer);
      trecDocReader.set(r);
    } else {
      r.set(docBuffer);
    }
    return r;
  }

  // read until finding a line that starts with the specified prefix, or a terminating tag has been found.
  private void read(StringBuffer buf, String prefix, boolean collectMatchLine,
      boolean collectAll, String terminatingTag)
      throws IOException, NoMoreDataException {
    String sep = "";
    while (true) {
      String line = reader.readLine();

      if (line == null) {
        openNextFile();
        continue;
      }

      rawDocSize += line.length();

      if (line.startsWith(prefix)) {
        if (collectMatchLine) {
          buf.append(sep).append(line);
          sep = NEW_LINE;
        }
        break;
      }

      if (terminatingTag != null && line.startsWith(terminatingTag)) {
        // didn't find the prefix that was asked, but the terminating
        // tag was found. set the length to 0 to signal no match was
        // found.
        buf.setLength(0);
        break;
      }

      if (collectAll) {
        buf.append(sep).append(line);
        sep = NEW_LINE;
      }
    }
  }

  void openNextFile() throws NoMoreDataException, IOException {
    close();
    int retries = 0;
    while (true) {
      if (nextFile >= inputFiles.size()) {
        // exhausted files, start a new round, unless forever set to false.
        if (!forever) {
          throw new NoMoreDataException();
        }
        nextFile = 0;
        iteration++;
      }
      File f = (File) inputFiles.get(nextFile++);
      if (verbose) {
        System.out.println("opening: " + f + " length: " + f.length());
      }
      try {
        GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), 1 << 16);
        reader = new BufferedReader(new InputStreamReader(zis), 1 << 16);
        return;
      } catch (Exception e) {
        retries++;
        if (retries < 20 && verbose) {
          System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + " #retries=" + retries);
          continue;
        }
        throw new NoMoreDataException();
      }
    }
  }

  Date parseDate(String dateStr) {
    dateStr = dateStr.trim();
    DateFormatInfo dfi = getDateFormatInfo();
    for (int i = 0; i < dfi.dfs.length; i++) {
      DateFormat df = dfi.dfs[i];
      dfi.pos.setIndex(0);
      dfi.pos.setErrorIndex(-1);
      Date d = df.parse(dateStr, dfi.pos);
      if (d != null) {
        // Parse succeeded.
        return d;
      }
    }
    // do not fail test just because a date could not be parsed
    if (verbose) {
      System.out.println("failed to parse date (assigning 'now') for: " + dateStr);
    }
    return null;
  }

  public void close() throws IOException {
    if (reader == null) {
      return;
    }

    try {
      reader.close();
    } catch (IOException e) {
      if (verbose) {
        System.out.println("failed to close reader !");
        e.printStackTrace(System.out);
      }
    }
    reader = null;
  }

  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    String dateStr = null, name = null;
    Reader r = null;
    // protect reading from the TREC files by multiple threads. The rest of the
    // method, i.e., parsing the content and returning the DocData can run
    // unprotected.
    synchronized (lock) {
      if (reader == null) {
        openNextFile();
      }

      StringBuffer docBuf = getDocBuffer();

      // 1. skip until doc start
      docBuf.setLength(0);
      read(docBuf, DOC, false, false, null);

      // 2. name
      docBuf.setLength(0);
      read(docBuf, DOCNO, true, false, null);
      name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
          DOCNO.length()));
      name = name + "_" + iteration;

      // 3. skip until doc header
      docBuf.setLength(0);
      read(docBuf, DOCHDR, false, false, null);

      boolean findTerminatingDocHdr = false;

      // 4. date - look for the date only until /DOCHDR
      docBuf.setLength(0);
      read(docBuf, DATE, true, false, TERMINATING_DOCHDR);
      if (docBuf.length() != 0) {
        // Date found.
        dateStr = docBuf.substring(DATE.length());
        findTerminatingDocHdr = true;
      }

      // 5. skip until end of doc header
      if (findTerminatingDocHdr) {
        docBuf.setLength(0);
        read(docBuf, TERMINATING_DOCHDR, false, false, null);
      }

      // 6. collect until end of doc
      docBuf.setLength(0);
      read(docBuf, TERMINATING_DOC, false, true, null);

      // 7. Set up a Reader over the read content
      r = getTrecDocReader(docBuf);
      // Resetting the thread's reader means it will reuse the instance
      // allocated as well as re-read from docBuf.
      r.reset();

      // count char length of parsed html text (larger than the plain doc body text).
      addBytes(docBuf.length());
    }

    // This code segment relies on HtmlParser being thread safe. When we get
    // here, everything else is already private to that thread, so we're safe.
    Date date = dateStr != null ? parseDate(dateStr) : null;
    try {
      docData = htmlParser.parse(docData, name, date, r, null);
      addDoc();
    } catch (InterruptedException e) {
      IOException ex = new IOException(e.getMessage());
      ex.initCause(e);
      throw ex;
    }

    return docData;
  }

  public void resetInputs() throws IOException {
    synchronized (lock) {
      super.resetInputs();
      close();
      nextFile = 0;
      iteration = 0;
    }
  }

  public void setConfig(Config config) {
    super.setConfig(config);
    File workDir = new File(config.get("work.dir", "work"));
    String d = config.get("docs.dir", "trec");
    dataDir = new File(d);
    if (!dataDir.isAbsolute()) {
      dataDir = new File(workDir, d);
    }
    collectFiles(dataDir, inputFiles);
    if (inputFiles.size() == 0) {
      throw new IllegalArgumentException("No files in dataDir: " + dataDir);
    }
    try {
      String parserClassName = config.get("html.parser",
          "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser");
      htmlParser = (HTMLParser) Class.forName(parserClassName).newInstance();
    } catch (Exception e) {
      // Should not get here. Throw runtime exception.
      throw new RuntimeException(e);
    }
  }

}
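As a side note, the date handling above combines two techniques worth calling out: per-thread DateFormatInfo instances (SimpleDateFormat is not thread-safe) and ParsePosition-based parsing that returns null instead of throwing ParseException. A standalone sketch of just the multi-format parsing part, using only JDK classes and a shortened list of the same format strings:

import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class MultiFormatDateParser {
  private static final String[] FORMATS = {
    "EEE, dd MMM yyyy kk:mm:ss z",  // Tue, 09 Dec 2003 22:39:08 GMT
    "EEE MMM dd kk:mm:ss yyyy z",   // Tue Dec 09 16:45:08 2003 EST
    "EEE MMM dd kk:mm:ss yyyy",     // Tue Dec 09 16:45:08 2003
  };

  public static Date parse(String dateStr) {
    String s = dateStr.trim();
    ParsePosition pos = new ParsePosition(0);
    for (int i = 0; i < FORMATS.length; i++) {
      SimpleDateFormat df = new SimpleDateFormat(FORMATS[i], Locale.US);
      df.setLenient(true);
      // reset the reusable position so a failed format does not poison the next try
      pos.setIndex(0);
      pos.setErrorIndex(-1);
      Date d = df.parse(s, pos); // returns null on failure instead of throwing
      if (d != null) {
        return d; // first format that parses wins
      }
    }
    return null; // unparsable dates are tolerated, as in the benchmark code
  }
}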
@@ -1,262 +0,0 @@
package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import java.util.zip.GZIPInputStream;

import org.apache.lucene.benchmark.byTask.utils.Config;

/**
 * A DocMaker using the (compressed) Trec collection for its input.
 * <p>
 * Config properties:<ul>
 * <li>work.dir=<path to the root of docs and indexes dirs| Default: work></li>
 * <li>docs.dir=<path to the docs dir| Default: trec></li>
 * </ul>
 */
public class TrecDocMaker extends BasicDocMaker {

  private static final String DATE = "Date: ";
  private static final String DOCHDR = "<DOCHDR>";
  private static final String TERM_DOCHDR = "</DOCHDR>";
  private static final String TERM_DOCNO = "</DOCNO>";
  private static final String DOCNO = "<DOCNO>";
  private static final String TERM_DOC = "</DOC>";
  private static final String DOC = "<DOC>";
  private static final String NEW_LINE = System.getProperty("line.separator");

  protected ThreadLocal dateFormat = new ThreadLocal();
  protected File dataDir = null;
  protected ArrayList inputFiles = new ArrayList();
  protected int nextFile = 0;
  protected int iteration = 0;
  protected BufferedReader reader;
  private GZIPInputStream zis;

  private static final String DATE_FORMATS [] = {
    "EEE, dd MMM yyyy kk:mm:ss z",   // Tue, 09 Dec 2003 22:39:08 GMT
    "EEE MMM dd kk:mm:ss yyyy z",    // Tue Dec 09 16:45:08 2003 EST
    "EEE, dd-MMM-':'y kk:mm:ss z",   // Tue, 09 Dec 2003 22:39:08 GMT
    "EEE, dd-MMM-yyy kk:mm:ss z",    // Tue, 09 Dec 2003 22:39:08 GMT
  };

  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
  public void setConfig(Config config) {
    super.setConfig(config);
    File workDir = new File(config.get("work.dir","work"));
    String d = config.get("docs.dir","trec");
    dataDir = new File(d);
    if (!dataDir.isAbsolute()) {
      dataDir = new File(workDir, d);
    }
    resetUniqueBytes();
    inputFiles.clear();
    collectFiles(dataDir,inputFiles);
    if (inputFiles.size()==0) {
      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
    }
  }

  protected void openNextFile() throws NoMoreDataException, Exception {
    closeInputs();
    int retries = 0;
    while (true) {
      File f = null;
      synchronized (this) {
        if (nextFile >= inputFiles.size()) {
          // exhausted files, start a new round, unless forever set to false.
          if (!forever) {
            throw new NoMoreDataException();
          }
          nextFile = 0;
          iteration++;
        }
        f = (File) inputFiles.get(nextFile++);
      }
      System.out.println("opening: "+f+" length: "+f.length());
      try {
        zis = new GZIPInputStream(new BufferedInputStream(new FileInputStream(f)));
        reader = new BufferedReader(new InputStreamReader(zis));
        return;
      } catch (Exception e) {
        retries++;
        if (retries<20) {
          System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries);
          continue;
        } else {
          throw new NoMoreDataException();
        }
      }
    }
  }

  protected void closeInputs() {
    if (zis!=null) {
      try {
        zis.close();
      } catch (IOException e) {
        System.out.println("closeInputs(): Ignoring error: "+e);
        e.printStackTrace();
      }
      zis = null;
    }
    if (reader!=null) {
      try {
        reader.close();
      } catch (IOException e) {
        System.out.println("closeInputs(): Ignoring error: "+e);
        e.printStackTrace();
      }
      reader = null;
    }
  }

  // read until finding a line that starts with the specified prefix
  protected StringBuffer read(String prefix, StringBuffer sb,
      boolean collectMatchLine, boolean collectAll,
      String terminatingTag) throws Exception {
    sb = (sb==null ? new StringBuffer() : sb);
    String sep = "";
    while (true) {
      String line = reader.readLine();
      if (line == null) {
        openNextFile();
        continue;
      }
      if (line.startsWith(prefix)) {
        if (collectMatchLine) {
          sb.append(sep).append(line);
          sep = NEW_LINE;
        }
        break;
      }

      if (terminatingTag != null && line.startsWith(terminatingTag)) {
        // didn't find the prefix that was asked, but the terminating
        // tag was found. set the length to 0 to signal no match was
        // found.
        sb.setLength(0);
        break;
      }

      if (collectAll) {
        sb.append(sep).append(line);
        sep = NEW_LINE;
      }
    }
    //System.out.println("read: "+sb);
    return sb;
  }

  protected synchronized DocData getNextDocData() throws NoMoreDataException, Exception {
    if (reader==null) {
      openNextFile();
    }
    // 1. skip until doc start
    read(DOC,null,false,false,null);
    // 2. name
    StringBuffer sb = read(DOCNO,null,true,false,null);
    String name = sb.substring(DOCNO.length(), sb.indexOf(TERM_DOCNO, DOCNO.length()));
    name = name + "_" + iteration;
    // 3. skip until doc header
    read(DOCHDR,null,false,false,null);
    boolean findTerminatingDocHdr = false;
    // 4. date
    sb = read(DATE,null,true,false,TERM_DOCHDR);
    String dateStr = null;
    if (sb.length() != 0) {
      // Date found.
      dateStr = sb.substring(DATE.length());
      findTerminatingDocHdr = true;
    }

    // 5. skip until end of doc header
    if (findTerminatingDocHdr) {
      read(TERM_DOCHDR,null,false,false,null);
    }
    // 6. collect until end of doc
    sb = read(TERM_DOC,null,false,true,null);
    // this is the next document, so parse it
    Date date = dateStr != null ? parseDate(dateStr) : null;
    HTMLParser p = getHtmlParser();
    DocData docData = p.parse(name, date, sb, getDateFormat(0));
    addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text).

    return docData;
  }

  protected DateFormat getDateFormat(int n) {
    DateFormat df[] = (DateFormat[]) dateFormat.get();
    if (df == null) {
      df = new SimpleDateFormat[DATE_FORMATS.length];
      for (int i = 0; i < df.length; i++) {
        df[i] = new SimpleDateFormat(DATE_FORMATS[i],Locale.US);
        df[i].setLenient(true);
      }
      dateFormat.set(df);
    }
    return df[n];
  }

  protected Date parseDate(String dateStr) {
    for (int i = 0; i < DATE_FORMATS.length; i++) {
      try {
        return getDateFormat(i).parse(dateStr.trim());
      } catch (ParseException e) {}
    }
    // do not fail test just because a date could not be parsed
    System.out.println("ignoring date parse exception (assigning 'null') for: "+dateStr);
    return null;
  }

  /*
   * (non-Javadoc)
   * @see DocMaker#resetIinputs()
   */
  public synchronized void resetInputs() {
    super.resetInputs();
    closeInputs();
    nextFile = 0;
    iteration = 0;
  }

  /*
   * (non-Javadoc)
   * @see DocMaker#numUniqueTexts()
   */
  public int numUniqueTexts() {
    return inputFiles.size();
  }

}
@@ -80,9 +80,8 @@ public class Sample {
    Properties p = new Properties();
    p.setProperty ( "task.max.depth.log" , "3" );
    p.setProperty ( "max.buffered" , "buf:10:10:100:100:10:10:100:100" );
-    p.setProperty ( "doc.maker" , "org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker" );
-    p.setProperty ( "doc.add.log.step" , "2000" );
-    p.setProperty ( "doc.delete.log.step" , "2000" );
+    p.setProperty ( "doc.maker" , "org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource" );
+    p.setProperty ( "log.step" , "2000" );
+    p.setProperty ( "doc.delete.step" , "8" );
    p.setProperty ( "analyzer" , "org.apache.lucene.analysis.standard.StandardAnalyzer" );
    p.setProperty ( "doc.term.vector" , "false" );

@@ -20,38 +20,23 @@ package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.document.Document;
-import java.text.NumberFormat;


/**
 * Add a document, optionally of a certain size.
 * <br>Other side effects: none.
- * <br>Relevant properties: <code>doc.add.log.step</code>.
 * <br>Takes optional param: document size.
 */
public class AddDocTask extends PerfTask {

-  /**
-   * Default value for property <code>doc.add.log.step</code> - indicating how often
-   * an "added N docs" message should be logged.
-   */
-  public static final int DEFAULT_ADD_DOC_LOG_STEP = 500;
-
  public AddDocTask(PerfRunData runData) {
    super(runData);
  }

-  private int logStep = -1;
  private int docSize = 0;
-  int count = 0;

  // volatile data passed between setup(), doLogic(), tearDown().
  private Document doc = null;

  /*
   * (non-Javadoc)
   * @see PerfTask#setup()
   */
  public void setup() throws Exception {
    super.setup();
    DocMaker docMaker = getRunData().getDocMaker();
@@ -62,33 +47,20 @@ public class AddDocTask extends PerfTask {
      }
    }

  /* (non-Javadoc)
   * @see PerfTask#tearDown()
   */
  public void tearDown() throws Exception {
-    log(++count);
    doc = null;
    super.tearDown();
  }

+  protected String getLogMessage(int recsCount) {
+    return "added " + recsCount + " docs";
+  }
+
  public int doLogic() throws Exception {
    getRunData().getIndexWriter().addDocument(doc);
    return 1;
  }

-  protected void log (int count) {
-    if (logStep<0) {
-      // init once per instance
-      logStep = getRunData().getConfig().get("doc.add.log.step",DEFAULT_ADD_DOC_LOG_STEP);
-    }
-    if (logStep>0 && (count%logStep)==0) {
-      double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
-      NumberFormat nf = NumberFormat.getInstance();
-      nf.setMaximumFractionDigits(2);
-      System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs");
-    }
-  }
-
  /**
   * Set the params (docSize only)
   * @param params docSize, or 0 for no limit.
@@ -0,0 +1,67 @@
package org.apache.lucene.benchmark.byTask.tasks;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.utils.Config;

/**
 * Consumes a {@link org.apache.lucene.benchmark.byTask.feeds.ContentSource}.
 * Supports the following parameters:
 * <ul>
 * <li>content.source - the content source to use. (mandatory)
 * </ul>
 */
public class ConsumeContentSourceTask extends PerfTask {

  private ContentSource source;
  private DocData dd = new DocData();

  public ConsumeContentSourceTask(PerfRunData runData) {
    super(runData);
    Config config = runData.getConfig();
    String sourceClass = config.get("content.source", null);
    if (sourceClass == null) {
      throw new IllegalArgumentException("content.source must be defined");
    }
    try {
      source = (ContentSource) Class.forName(sourceClass).newInstance();
      source.setConfig(config);
      source.resetInputs();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  protected String getLogMessage(int recsCount) {
    return "read " + recsCount + " documents from the content source";
  }

  public void close() throws Exception {
    source.close();
    super.close();
  }

  public int doLogic() throws Exception {
    dd = source.getNextDocData(dd);
    return 1;
  }

}
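To see the task in context, here is a hedged sketch that wires it up programmatically in the style of the Sample code above. The property values are illustrative; driving the task from an .alg algorithm is the normal route, and the sketch assumes Config's Properties-based constructor and the PerfRunData(Config) constructor behave as in the benchmark framework at this point.

import java.util.Properties;

import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.tasks.ConsumeContentSourceTask;
import org.apache.lucene.benchmark.byTask.utils.Config;

public class ConsumeDemo {
  public static void main(String[] args) throws Exception {
    // Hypothetical setup; normally these lines live in an .alg file.
    Properties p = new Properties();
    p.setProperty("content.source",
        "org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource");
    p.setProperty("content.source.forever", "false");
    p.setProperty("log.step", "2000");

    PerfRunData runData = new PerfRunData(new Config(p));
    ConsumeContentSourceTask task = new ConsumeContentSourceTask(runData);
    try {
      while (true) {
        task.doLogic();  // pulls the next DocData from the source
        task.tearDown(); // central PerfTask logging, every log.step records
      }
    } catch (Exception e) {
      // a NoMoreDataException surfaces here once the source is exhausted
    } finally {
      task.close();
    }
  }
}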
@@ -22,7 +22,7 @@ import org.apache.lucene.benchmark.byTask.PerfRunData;
/**
 * Delete a document by docid.
 * <br>Other side effects: none.
- * <br>Relevant properties: <code>doc.delete.log.step , doc.delete.step</code>.
+ * <br>Relevant properties: <code>doc.delete.step, delete.log.step</code>.
 * <br>If no docid param is supplied, deletes doc with <code>id = last-deleted-doc + doc.delete.step</code>.
 * <br>Takes optional param: document id.
 */
@@ -33,19 +33,16 @@ public class DeleteDocTask extends PerfTask {
   */
  public static final int DEFAULT_DOC_DELETE_STEP = 8;

-  /**
-   * Default value for property <code>doc.delete.log.step</code> - indicating how often
-   * a "deleted N docs" message should be logged.
-   */
-  public static final int DEFAULT_DELETE_DOC_LOG_STEP = 500;
-
  public DeleteDocTask(PerfRunData runData) {
    super(runData);
+    // Override log.step, which is read by PerfTask
+    int deleteLogStep = runData.getConfig().get("delete.log.step", -1);
+    if (deleteLogStep != -1) {
+      logStep = deleteLogStep;
+    }
  }

-  private int logStep = -1;
  private int deleteStep = -1;
  private static int numDeleted = 0;
  private static int lastDeleted = -1;

  private int docid = -1;
@@ -62,10 +59,6 @@ public class DeleteDocTask extends PerfTask {
   */
  public void setup() throws Exception {
    super.setup();
-    // one time static initializations
-    if (logStep<0) {
-      logStep = getRunData().getConfig().get("doc.delete.log.step",DEFAULT_DELETE_DOC_LOG_STEP);
-    }
    if (deleteStep<0) {
      deleteStep = getRunData().getConfig().get("doc.delete.step",DEFAULT_DOC_DELETE_STEP);
    }
@@ -73,18 +66,8 @@ public class DeleteDocTask extends PerfTask {
    docid = (byStep ? lastDeleted + deleteStep : docid);
  }

-  /* (non-Javadoc)
-   * @see PerfTask#tearDown()
-   */
-  public void tearDown() throws Exception {
-    log(++numDeleted);
-    super.tearDown();
-  }
-
-  private void log (int count) {
-    if (logStep>0 && (count%logStep)==0) {
-      System.out.println("--> processed (delete) "+count+" docs, last deleted: "+lastDeleted);
-    }
-  }
-
+  protected String getLogMessage(int recsCount) {
+    return "deleted " + recsCount + " docs, last deleted: " + lastDeleted;
+  }

  /**
@@ -17,54 +17,80 @@ package org.apache.lucene.benchmark.byTask.tasks;
 * limitations under the License.
 */

+import java.text.NumberFormat;
+
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.stats.Points;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
+import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;

/**
- * An (abstract) task to be tested for performance.
- * <br>
- * Every performance task extends this class, and provides its own doLogic() method,
- * which performs the actual task.
- * <br>
- * Tasks performing some work that should be measured for the task, can override setup() and/or tearDown() and
- * place that work there.
- * <br>
+ * An abstract task to be tested for performance. <br>
+ * Every performance task extends this class, and provides its own
+ * {@link #doLogic()} method, which performs the actual task. <br>
+ * Tasks performing some work that should be measured for the task, can override
+ * {@link #setup()} and/or {@link #tearDown()} and place that work there. <br>
 * Relevant properties: <code>task.max.depth.log</code>.
 */
public abstract class PerfTask implements Cloneable {

+  private static final int DEFAULT_LOG_STEP = 1000;
+
  private PerfRunData runData;

  // properties that all tasks have
  private String name;
  private int depth = 0;
+  protected int logStep;
+  private int logStepCount = 0;
  private int maxDepthLogStart = 0;
  private boolean disableCounting = false;
  protected String params = null;

  protected static final String NEW_LINE = System.getProperty("line.separator");

-  /**
-   * Should not be used externally
-   */
+  /** Should not be used externally */
  private PerfTask() {
    name = Format.simpleName(getClass());
    if (name.endsWith("Task")) {
-      name = name.substring(0,name.length()-4);
+      name = name.substring(0, name.length() - 4);
    }
  }

+  /**
+   * @deprecated will be removed in 3.0. checks if there are any obsolete
+   *             settings, like doc.add.log.step and doc.delete.log.step and
+   *             alerts the user.
+   */
+  private void checkObsoleteSettings(Config config) {
+    if (config.get("doc.add.log.step", null) != null) {
+      throw new RuntimeException("doc.add.log.step is not supported anymore. " +
+          "Use log.step and refer to CHANGES to read on the recent API changes " +
+          "done to Benchmark's DocMaker and Task-based logging.");
+    }
+
+    if (config.get("doc.delete.log.step", null) != null) {
+      throw new RuntimeException("doc.delete.log.step is not supported anymore. " +
+          "Use delete.log.step and refer to CHANGES to read on the recent API changes " +
+          "done to Benchmark's DocMaker and Task-based logging.");
+    }
+  }
+
  public PerfTask(PerfRunData runData) {
    this();
    this.runData = runData;
-    this.maxDepthLogStart = runData.getConfig().get("task.max.depth.log",0);
+    Config config = runData.getConfig();
+    this.maxDepthLogStart = config.get("task.max.depth.log",0);
+    logStep = config.get("log.step", DEFAULT_LOG_STEP);
+    // To avoid the check 'if (logStep > 0)' in tearDown(). This effectively
+    // turns logging off.
+    if (logStep <= 0) {
+      logStep = Integer.MAX_VALUE;
+    }
+    checkObsoleteSettings(config);
  }

  /* (non-Javadoc)
   * @see java.lang.Object#clone()
   */
  protected Object clone() throws CloneNotSupportedException {
    // tasks having non primitive data structures should override this.
    // otherwise parallel running of a task sequence might not run correctly.
@@ -173,6 +199,10 @@ public abstract class PerfTask implements Cloneable {
    return maxDepthLogStart;
  }

+  protected String getLogMessage(int recsCount) {
+    return "processed " + recsCount + " records";
+  }
+
  /**
   * Tasks that should never log at start can override this.
   * @return true if this task should never log when it starts.
@@ -207,7 +237,14 @@ public abstract class PerfTask implements Cloneable {
   * Notice that higher level (sequence) tasks containing this task would then
   * measure larger time than the sum of their contained tasks.
   */
-  public void tearDown () throws Exception {
+  public void tearDown() throws Exception {
+    if (++logStepCount % logStep == 0) {
+      double time = (System.currentTimeMillis() - runData.getStartTimeMillis()) / 1000.0;
+      NumberFormat nf = NumberFormat.getInstance();
+      nf.setMaximumFractionDigits(2);
+      System.out.println(nf.format(time) + " sec --> "
+          + Thread.currentThread().getName() + " " + getLogMessage(logStepCount));
+    }
  }

  /**
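The net effect of the two PerfTask hunks above is that per-task logging boilerplate disappears: tearDown() counts records and prints getLogMessage(count) once every log.step records, so a task only supplies the message text. A hedged sketch of a minimal task against the new base class (the class name is invented for illustration, and it assumes the byTask.tasks package):

package org.apache.lucene.benchmark.byTask.tasks;

import org.apache.lucene.benchmark.byTask.PerfRunData;

// Hypothetical task: no logStep/count fields and no log() helper needed,
// because PerfTask.tearDown() handles the logging cadence centrally.
public class ExampleTask extends PerfTask {

  public ExampleTask(PerfRunData runData) {
    super(runData);
  }

  public int doLogic() throws Exception {
    // ... the measured work goes here ...
    return 1; // number of records processed by this invocation
  }

  protected String getLogMessage(int recsCount) {
    return "example-task processed " + recsCount + " records";
  }
}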
@@ -17,58 +17,44 @@ package org.apache.lucene.benchmark.byTask.tasks;
 * limitations under the License.
 */

-import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import java.text.NumberFormat;
import java.io.Reader;
import java.util.List;

+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;

/**
 * Simple task to test performance of tokenizers. It just
 * creates a token stream for each field of the document and
 * reads all tokens out of that stream.
- * <br>Relevant properties: <code>doc.tokenize.log.step</code>.
 */
public class ReadTokensTask extends PerfTask {

-  /**
-   * Default value for property <code>doc.tokenize.log.step</code> - indicating how often
-   * an "added N docs / M tokens" message should be logged.
-   */
-  public static final int DEFAULT_DOC_LOG_STEP = 500;
-
  public ReadTokensTask(PerfRunData runData) {
    super(runData);
  }

-  private int logStep = -1;
  int count = 0;
-  int totalTokenCount = 0;
+  private int totalTokenCount = 0;

  // volatile data passed between setup(), doLogic(), tearDown().
  private Document doc = null;

  /*
   * (non-Javadoc)
   * @see PerfTask#setup()
   */
  public void setup() throws Exception {
    super.setup();
    DocMaker docMaker = getRunData().getDocMaker();
    doc = docMaker.makeDocument();
  }

-  /* (non-Javadoc)
-   * @see PerfTask#tearDown()
-   */
+  protected String getLogMessage(int recsCount) {
+    return "read " + recsCount + " docs; " + totalTokenCount + " tokens";
+  }
+
  public void tearDown() throws Exception {
-    log(++count);
    doc = null;
    super.tearDown();
  }
@@ -117,19 +103,6 @@ public class ReadTokensTask extends PerfTask {
    return tokenCount;
  }

-  private void log(int count) {
-    if (logStep<0) {
-      // init once per instance
-      logStep = getRunData().getConfig().get("doc.tokenize.log.step", DEFAULT_DOC_LOG_STEP);
-    }
-    if (logStep>0 && (count%logStep)==0) {
-      double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
-      NumberFormat nf = NumberFormat.getInstance();
-      nf.setMaximumFractionDigits(2);
-      System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs" + "; " + totalTokenCount + " tokens");
-    }
-  }
-
  /* Simple StringReader that can be reset to a new string;
   * we use this when tokenizing the string value from a
   * Field. */
@@ -62,6 +62,7 @@ public class TaskSequence extends PerfTask {
    for(int i=0;i<tasksArray.length;i++) {
      tasksArray[i].close();
    }
    getRunData().getDocMaker().close();
  }

  private void initTasksArray() {

@@ -106,8 +107,8 @@ public class TaskSequence extends PerfTask {
      if (isParallel()) {
        throw new Exception("REPEAT_EXHAUST is not allowed for parallel tasks");
      }
      if (getRunData().getConfig().get("doc.maker.forever",true)) {
        throw new Exception("REPEAT_EXHAUST requires setting doc.maker.forever=false");
      if (getRunData().getConfig().get("content.source.forever",true)) {
        throw new Exception("REPEAT_EXHAUST requires setting content.source.forever=false");
      }
    }
    setSequenceName();
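Since REPEAT_EXHAUST now insists on content.source.forever=false, an exhausting sequence has to disable the endless source explicitly. A sketch in the same String[] style the tests in this patch use (the property names are the real ones; the task line is illustrative, with ":*" being the exhaust repeat):

    // Hypothetical .alg fragment: '*' repeats the sequence until the
    // content source throws NoMoreDataException, which requires a
    // finite source - hence content.source.forever=false.
    String algLines[] = {
        "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource",
        "content.source.forever=false",
        "{ AddDoc } : *",
    };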
@@ -19,17 +19,13 @@ package org.apache.lucene.benchmark.byTask.tasks;

import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import java.text.NumberFormat;

/**
 * Update a document, using IndexWriter.updateDocument,
 * optionally of a certain size.
 * <br>Other side effects: none.
 * <br>Relevant properties: <code>doc.add.log.step</code>.
 * <br>Takes optional param: document size.
 */
public class UpdateDocTask extends PerfTask {

@@ -38,17 +34,11 @@ public class UpdateDocTask extends PerfTask {
    super(runData);
  }

  private int logStep = -1;
  private int docSize = 0;
  int count = 0;

  // volatile data passed between setup(), doLogic(), tearDown().
  private Document doc = null;

  /*
   * (non-Javadoc)
   * @see PerfTask#setup()
   */
  public void setup() throws Exception {
    super.setup();
    DocMaker docMaker = getRunData().getDocMaker();

@@ -59,38 +49,24 @@ public class UpdateDocTask extends PerfTask {
    }
  }

  /* (non-Javadoc)
   * @see PerfTask#tearDown()
   */
  public void tearDown() throws Exception {
    log(++count);
    doc = null;
    super.tearDown();
  }

  public int doLogic() throws Exception {
    final String docID = doc.get(BasicDocMaker.ID_FIELD);
    final String docID = doc.get(DocMaker.ID_FIELD);
    if (docID == null) {
      throw new IllegalStateException("document must define the docid field");
    }
    getRunData().getIndexWriter().updateDocument(new Term(BasicDocMaker.ID_FIELD, docID),
        doc);
    getRunData().getIndexWriter().updateDocument(new Term(DocMaker.ID_FIELD, docID), doc);
    return 1;
  }

  private void log (int count) {
    if (logStep<0) {
      // init once per instance
      logStep = getRunData().getConfig().get("doc.add.log.step",AddDocTask.DEFAULT_ADD_DOC_LOG_STEP);
    }
    if (logStep>0 && (count%logStep)==0) {
      double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
      NumberFormat nf = NumberFormat.getInstance();
      nf.setMaximumFractionDigits(2);
      System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (update) "+count+" docs");
    }
  protected String getLogMessage(int recsCount) {
    return "updated " + recsCount + " docs";
  }

  /**
   * Set the params (docSize only)
   * @param params docSize, or 0 for no limit.
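The rewritten doLogic() above keys the update on DocMaker.ID_FIELD; stripped of the harness, the idiom is just the following (a sketch; the writer and doc are assumed to exist already):

    // Update-by-id as doLogic() does it above: the ID_FIELD term selects
    // the document(s) to replace with the new doc.
    String docID = doc.get(DocMaker.ID_FIELD);
    if (docID == null) {
      throw new IllegalStateException("document must define the docid field");
    }
    writer.updateDocument(new Term(DocMaker.ID_FIELD, docID), doc);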
@@ -25,7 +25,6 @@ import java.io.OutputStreamWriter;

import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;

@@ -45,23 +44,13 @@ import org.apache.lucene.document.Field;
 * <li>bzip.compression - whether the output should be bzip-compressed. This is
 * recommended when the output file is expected to be large. (optional, default:
 * false).
 * <li>doc.writeline.log.step - controls how many records to process before
 * logging the status of the task. <b>NOTE:</b> to disable logging, set this
 * value to 0 or negative. (optional, default: 1000).
 * </ul>
 */
public class WriteLineDocTask extends PerfTask {

  /**
   * Default value for property <code>doc.writeline.log.step</code> - indicating how often
   * an "added N docs" message should be logged.
   */
  public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
  public final static char SEP = '\t';

  private int logStep = -1;
  private int docSize = 0;
  int count = 0;
  private BufferedWriter lineFileOut = null;
  private DocMaker docMaker;

@@ -93,30 +82,23 @@ public class WriteLineDocTask extends PerfTask {
    }
    lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
    docMaker = runData.getDocMaker();
    logStep = config.get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
    // To avoid the check 'if (logStep > 0)' in log(). This effectively turns
    // logging off.
    if (logStep <= 0) {
      logStep = Integer.MAX_VALUE;
    }
  }

  public void tearDown() throws Exception {
    log(++count);
    super.tearDown();
  protected String getLogMessage(int recsCount) {
    return "Wrote " + recsCount + " line docs";
  }

  public int doLogic() throws Exception {
    Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();

    Field f = doc.getField(BasicDocMaker.BODY_FIELD);
    Field f = doc.getField(DocMaker.BODY_FIELD);
    String body = f != null ? f.stringValue().replace('\t', ' ') : null;

    if (body != null) {
      f = doc.getField(BasicDocMaker.TITLE_FIELD);
      f = doc.getField(DocMaker.TITLE_FIELD);
      String title = f != null ? f.stringValue().replace('\t', ' ') : "";

      f = doc.getField(BasicDocMaker.DATE_FIELD);
      f = doc.getField(DocMaker.DATE_FIELD);
      String date = f != null ? f.stringValue().replace('\t', ' ') : "";

      lineFileOut.write(title, 0, title.length());

@@ -129,17 +111,6 @@ public class WriteLineDocTask extends PerfTask {
    return 1;
  }

  private void log(int count) {
    // logStep is initialized in the ctor to a positive value. If the config
    // file indicates no logging, or contains an invalid value, logStep is init
    // to Integer.MAX_VALUE, so that logging will not occur (at least for the
    // first Integer.MAX_VALUE records).
    if (count % logStep == 0) {
      System.out.println("--> " + Thread.currentThread().getName()
          + " processed (write line) " + count + " docs");
    }
  }

  public void close() throws Exception {
    lineFileOut.close();
    super.close();

@@ -156,9 +127,6 @@ public class WriteLineDocTask extends PerfTask {
    docSize = (int) Float.parseFloat(params);
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams()
   */
  public boolean supportsParams() {
    return true;
  }
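doLogic() above flattens each document to a single line: title, date and body joined by SEP ('\t'), after replacing any tabs inside the values with spaces. A sketch of how one such record is assembled (the field values are invented examples):

    // One line-doc record, assuming SEP = '\t' as declared above; this is
    // the format the LineDocMaker tests later in this patch read back.
    String title = "Example title".replace('\t', ' ');
    String date  = "Sun, 11 Jan 2009 08:00:00 GMT".replace('\t', ' ');
    String body  = "Example body text".replace('\t', ' ');
    String line  = title + '\t' + date + '\t' + body;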
@@ -0,0 +1,173 @@
package org.apache.lucene.benchmark.byTask.utils;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

/**
 * Implements a {@link Reader} over a {@link StringBuffer} instance. Although
 * one can use {@link java.io.StringReader} by passing it
 * {@link StringBuffer#toString()}, it is better to use this class, as it
 * doesn't mark the passed-in {@link StringBuffer} as shared (which will cause
 * inner char[] allocations at the next append() attempt).<br>
 * Notes:
 * <ul>
 * <li>This implementation assumes the underlying {@link StringBuffer} is not
 * changed during the use of this {@link Reader} implementation.
 * <li>This implementation is thread-safe.
 * <li>The implementation looks very much like {@link java.io.StringReader} (for
 * the right reasons).
 * <li>If one wants to reuse that instance, then the following needs to be done:
 * <pre>
 * StringBuffer sb = new StringBuffer("some text");
 * Reader reader = new StringBufferReader(sb);
 * ... read from reader - don't close it! ...
 * sb.setLength(0);
 * sb.append("some new text");
 * reader.reset();
 * ... read the new string from the reader ...
 * </pre>
 * </ul>
 */
public class StringBufferReader extends Reader {

  // TODO (3.0): change to StringBuilder (including the name of the class)

  // The StringBuffer to read from.
  private StringBuffer sb;

  // The length of 'sb'.
  private int length;

  // The next position to read from the StringBuffer.
  private int next = 0;

  // The mark position. The default value 0 means the start of the text.
  private int mark = 0;

  public StringBufferReader(StringBuffer sb) {
    set(sb);
  }

  /** Check to make sure that the stream has not been closed. */
  private void ensureOpen() throws IOException {
    if (sb == null) {
      throw new IOException("Stream has already been closed");
    }
  }

  public void close() {
    synchronized (lock) {
      sb = null;
    }
  }

  /**
   * Mark the present position in the stream. Subsequent calls to reset() will
   * reposition the stream to this point.
   *
   * @param readAheadLimit Limit on the number of characters that may be read
   *        while still preserving the mark. Because the stream's input comes
   *        from a StringBuffer, there is no actual limit, so this argument
   *        must not be negative, but is otherwise ignored.
   * @exception IllegalArgumentException If readAheadLimit is < 0
   * @exception IOException If an I/O error occurs
   */
  public void mark(int readAheadLimit) throws IOException {
    if (readAheadLimit < 0){
      throw new IllegalArgumentException("Read-ahead limit cannot be negative: " + readAheadLimit);
    }
    synchronized (lock) {
      ensureOpen();
      mark = next;
    }
  }

  public boolean markSupported() {
    return true;
  }

  public int read() throws IOException {
    synchronized (lock) {
      ensureOpen();
      return next >= length ? -1 : sb.charAt(next++);
    }
  }

  public int read(char cbuf[], int off, int len) throws IOException {
    synchronized (lock) {
      ensureOpen();

      // Validate parameters
      if (off < 0 || off > cbuf.length || len < 0 || off + len > cbuf.length) {
        throw new IndexOutOfBoundsException("off=" + off + " len=" + len + " cbuf.length=" + cbuf.length);
      }

      if (len == 0) {
        return 0;
      }

      if (next >= length) {
        return -1;
      }

      int n = Math.min(length - next, len);
      sb.getChars(next, next + n, cbuf, off);
      next += n;
      return n;
    }
  }

  public boolean ready() throws IOException {
    synchronized (lock) {
      ensureOpen();
      return true;
    }
  }

  public void reset() throws IOException {
    synchronized (lock) {
      ensureOpen();
      next = mark;
      length = sb.length();
    }
  }

  public void set(StringBuffer sb) {
    synchronized (lock) {
      this.sb = sb;
      length = sb.length();
    }
  }

  public long skip(long ns) throws IOException {
    synchronized (lock) {
      ensureOpen();
      if (next >= length) {
        return 0;
      }

      // Bound skip by beginning and end of the source
      long n = Math.min(length - next, ns);
      n = Math.max(-next, n);
      next += n;
      return n;
    }
  }

}
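The reuse recipe from the class javadoc, made compilable (a sketch; it assumes it sits somewhere StringBufferReader is visible):

    import java.io.IOException;

    class StringBufferReaderDemo {
      public static void main(String[] args) throws IOException {
        StringBuffer sb = new StringBuffer("some text");
        StringBufferReader reader = new StringBufferReader(sb);
        char[] buf = new char[64];
        int n = reader.read(buf, 0, buf.length);   // reads "some text"
        System.out.println(new String(buf, 0, n));

        sb.setLength(0);                           // refill the same buffer
        sb.append("some new text");
        reader.reset();                            // back to the mark; picks up the new length
        n = reader.read(buf, 0, buf.length);       // reads "some new text"
        System.out.println(new String(buf, 0, n));
      }
    }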
@@ -17,18 +17,17 @@ package org.apache.lucene.benchmark.utils;
 * limitations under the License.
 */

import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Properties;

import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Properties;

/**
 * Extract the downloaded Wikipedia dump into separate files for indexing.
 */

@@ -51,7 +50,6 @@ public class ExtractWikipedia {
    }
  }

  public File directory(int count, File directory) {
    if (directory == null) {
      directory = outputDir;

@@ -99,7 +97,8 @@ public class ExtractWikipedia {
    long start = System.currentTimeMillis();
    try {
      while ((doc = docMaker.makeDocument()) != null) {
        create(doc.get(BasicDocMaker.ID_FIELD), doc.get(BasicDocMaker.TITLE_FIELD), doc.get(BasicDocMaker.DATE_FIELD), doc.get(BasicDocMaker.BODY_FIELD));
        create(doc.get(DocMaker.ID_FIELD), doc.get(DocMaker.TITLE_FIELD), doc
            .get(DocMaker.DATE_FIELD), doc.get(DocMaker.BODY_FIELD));
      }
    } catch (NoMoreDataException e) {
      //continue

@@ -130,7 +129,7 @@ public class ExtractWikipedia {
    Properties properties = new Properties();

    properties.setProperty("docs.file", wikipedia.getAbsolutePath());
    properties.setProperty("doc.maker.forever", "false");
    properties.setProperty("content.source.forever", "false");
    properties.setProperty("keep.image.only.docs", String.valueOf(keepImageOnlyDocs));
    docMaker.setConfig(new Config(properties));
    docMaker.resetInputs();
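For code that drives a DocMaker directly, as main() does here, the migration is just the renamed key. A minimal sketch of the same Properties-driven setup (the docs.file path is invented; docMaker is assumed to be constructed elsewhere):

    // Assumed: docMaker was instantiated elsewhere; only the config keys matter.
    Properties props = new Properties();
    props.setProperty("docs.file", "/path/to/enwiki-pages-articles.xml");
    props.setProperty("content.source.forever", "false");   // was doc.maker.forever
    docMaker.setConfig(new Config(props));
    docMaker.resetInputs();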
@@ -17,6 +17,7 @@

package org.apache.lucene.benchmark.byTask;

import java.io.IOException;
import java.io.StringReader;
import java.io.File;
import java.io.FileReader;

@@ -26,7 +27,7 @@ import java.util.Iterator;

import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource;
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;

@@ -114,7 +115,7 @@ public class TestPerfTasksLogic extends TestCase {
    };

    CountingSearchTestTask.numSearches = 0;
    Benchmark benchmark = execBenchmark(algLines);
    execBenchmark(algLines);
    assertTrue(CountingSearchTestTask.numSearches > 0);
    long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis;
    assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500);

@@ -124,7 +125,7 @@ public class TestPerfTasksLogic extends TestCase {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "doc.stored=true",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "content.source="+Reuters20ContentSource.class.getName(),
        "query.maker=" + ReutersQueryMaker.class.getName(),
        "ResetSystemErase",
        "CreateIndex",

@@ -162,7 +163,7 @@ public class TestPerfTasksLogic extends TestCase {
    String algLines[] = {
        "doc.stored=true",//doc storage is required in order to have text to highlight
        "doc.term.vector.offsets=true",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "content.source="+Reuters20ContentSource.class.getName(),
        "query.maker=" + ReutersQueryMaker.class.getName(),
        "ResetSystemErase",
        "CreateIndex",

@@ -199,7 +200,7 @@ public class TestPerfTasksLogic extends TestCase {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "doc.stored=false",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "content.source="+Reuters20ContentSource.class.getName(),
        "query.maker=" + ReutersQueryMaker.class.getName(),
        "ResetSystemErase",
        "CreateIndex",

@@ -227,14 +228,14 @@ public class TestPerfTasksLogic extends TestCase {
  /**
   * Test Exhausting Doc Maker logic
   */
  public void testExhaustDocMaker() throws Exception {
  public void testExhaustContentSource() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker",
        "doc.add.log.step=1",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
        "content.source.log.step=1",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",

@@ -274,10 +275,10 @@ public class TestPerfTasksLogic extends TestCase {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "doc.add.log.step=3",
        "content.source="+Reuters20ContentSource.class.getName(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=FSDirectory",
        "doc.stored=false",
        "doc.tokenized=false",

@@ -292,7 +293,7 @@ public class TestPerfTasksLogic extends TestCase {

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

@@ -309,8 +310,8 @@ public class TestPerfTasksLogic extends TestCase {
    // Creates a line file with first 500 docs from reuters
    String algLines1[] = {
        "# ----- properties ",
        "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
        "doc.maker.forever=false",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource",
        "content.source.forever=false",
        "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'),
        "# ----- alg ",
        "{WriteLineDoc()}:" + NUM_TRY_DOCS,

@@ -335,7 +336,7 @@ public class TestPerfTasksLogic extends TestCase {
        "analyzer=org.apache.lucene.analysis.SimpleAnalyzer",
        "doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker",
        "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
        "doc.maker.forever=false",
        "content.source.forever=false",
        "doc.reuse.fields=false",
        "autocommit=false",
        "ram.flush.mb=4",

@@ -373,7 +374,7 @@ public class TestPerfTasksLogic extends TestCase {
    String algLines1[] = {
        "# ----- properties ",
        "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
        "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource",
        "# ----- alg ",
        "{ReadTokens}: " + NUM_DOCS,
        "ResetSystemErase",

@@ -421,10 +422,10 @@ public class TestPerfTasksLogic extends TestCase {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "doc.add.log.step=3",
        "content.source="+Reuters20ContentSource.class.getName(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",

@@ -442,7 +443,7 @@ public class TestPerfTasksLogic extends TestCase {

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 2 * 20; // Reuters20DocMaker exhausts after 20 docs.
    int ndocsExpected = 2 * 20; // Reuters20ContentSource exhausts after 20 docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

@@ -477,16 +478,19 @@ public class TestPerfTasksLogic extends TestCase {
  }

  /** use reuters and the exhaust mechanism, but to be faster, add 20 docs only... */
  public static class Reuters20DocMaker extends ReutersDocMaker {
    private int nDocs=0;
    protected synchronized DocData getNextDocData() throws Exception {
      if (nDocs>=20 && !forever) {
  public static class Reuters20ContentSource extends ReutersContentSource {
    private int nDocs = 0;

    public synchronized DocData getNextDocData(DocData docData)
        throws NoMoreDataException, IOException {
      if (nDocs >= 20 && !forever) {
        throw new NoMoreDataException();
      }
      nDocs++;
      return super.getNextDocData();
      return super.getNextDocData(docData);
    }
    public synchronized void resetInputs() {

    public synchronized void resetInputs() throws IOException {
      super.resetInputs();
      nDocs = 0;
    }

@@ -499,10 +503,10 @@ public class TestPerfTasksLogic extends TestCase {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "doc.add.log.step=3",
        "content.source="+Reuters20ContentSource.class.getName(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",

@@ -521,7 +525,7 @@ public class TestPerfTasksLogic extends TestCase {

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

@@ -533,12 +537,12 @@ public class TestPerfTasksLogic extends TestCase {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "content.source="+Reuters20ContentSource.class.getName(),
        "ram.flush.mb=-1",
        "max.buffered=2",
        "doc.add.log.step=3",
        "content.source.log.step=3",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",

@@ -557,7 +561,7 @@ public class TestPerfTasksLogic extends TestCase {

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

@@ -577,10 +581,10 @@ public class TestPerfTasksLogic extends TestCase {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "doc.add.log.step=3",
        "content.source="+Reuters20ContentSource.class.getName(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.scheduler=" + MyMergeScheduler.class.getName(),
        "doc.stored=false",

@@ -601,7 +605,7 @@ public class TestPerfTasksLogic extends TestCase {

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

@@ -620,12 +624,12 @@ public class TestPerfTasksLogic extends TestCase {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "doc.add.log.step=3",
        "content.source="+Reuters20ContentSource.class.getName(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=2",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.policy=" + MyMergePolicy.class.getName(),
        "doc.stored=false",

@@ -646,7 +650,7 @@ public class TestPerfTasksLogic extends TestCase {

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

@@ -658,13 +662,13 @@ public class TestPerfTasksLogic extends TestCase {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "doc.add.log.step=3",
        "content.source="+Reuters20ContentSource.class.getName(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=2",
        "compound=cmpnd:true:false",
        "doc.term.vector=vector:false:true",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "merge.factor=3",

@@ -702,12 +706,12 @@ public class TestPerfTasksLogic extends TestCase {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "doc.add.log.step=3",
        "content.source="+Reuters20ContentSource.class.getName(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=3",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.policy=org.apache.lucene.index.LogDocMergePolicy",
        "doc.stored=false",

@@ -728,7 +732,7 @@ public class TestPerfTasksLogic extends TestCase {

    // 3. test number of docs in the index
    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // Reuters20DocMaker exhausts after 20 docs.
    int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();

@@ -780,10 +784,10 @@ public class TestPerfTasksLogic extends TestCase {
    String dis = disable ? "-" : "";
    return new String[] {
        "# ----- properties ",
        "doc.maker="+Reuters20DocMaker.class.getName(),
        "doc.add.log.step=30",
        "content.source="+Reuters20ContentSource.class.getName(),
        "content.source.log.step=30",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
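Reuters20ContentSource above also shows the new pull API: getNextDocData(DocData) takes the instance to refill and returns it, and a finite source signals exhaustion with NoMoreDataException. A sketch of a plain consumer loop over any configured ContentSource (the source variable and the indexing step are assumed):

    // Drain a finite content source (content.source.forever=false),
    // reusing one DocData instance as the tests above do.
    DocData dd = new DocData();
    try {
      while (true) {
        dd = source.getNextDocData(dd);   // refills and returns dd
        // ... hand dd to a DocMaker / index the resulting Document here ...
      }
    } catch (NoMoreDataException e) {
      // expected: the source ran dry
    }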
@@ -111,35 +111,11 @@ public class LineDocMakerTest extends BenchmarkTestCase {
    doIndexAndSearchTest(file, false, null);
  }

  public void testBZip2WithBzipCompressionDisabled() throws Exception {
    File file = new File(getWorkDir(), "one-line.bz2");
    createBZ2LineFile(file);

    try {
      doIndexAndSearchTest(file, true, "false");
      fail("Some exception should have been thrown !");
    } catch (Exception e) {
      // expected.
    }
  }

  public void testRegularFile() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    createRegularLineFile(file);
    doIndexAndSearchTest(file, false, null);
  }

  public void testRegularFileWithBZipCompressionEnabled() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    createRegularLineFile(file);

    try {
      doIndexAndSearchTest(file, true, "true");
      fail("Some exception should have been thrown !");
    } catch (Exception e) {
      // expected.
    }
  }

  public void testInvalidFormat() throws Exception {
    String[] testCases = new String[] {
@@ -18,24 +18,29 @@ package org.apache.lucene.benchmark.byTask.feeds;
 */

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.text.ParseException;
import java.util.Date;

import junit.framework.TestCase;

public class TrecDocMakerTest extends TestCase {
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.DateTools;

public class TrecContentSourceTest extends TestCase {

  /** A TrecContentSource which works on a String and not files. */
  private static class StringableTrecDocMaker extends TrecDocMaker {
  private static class StringableTrecSource extends TrecContentSource {

    private String docs = null;

    public StringableTrecDocMaker(String docs, boolean forever) {
    public StringableTrecSource(String docs, boolean forever) {
      this.docs = docs;
      this.forever = forever;
    }

    protected void openNextFile() throws NoMoreDataException, Exception {
    protected void openNextFile() throws NoMoreDataException, IOException {
      if (reader != null) {
        if (!forever) {
          throw new NoMoreDataException();

@@ -46,20 +51,26 @@ public class TrecDocMakerTest extends TestCase {
      reader = new BufferedReader(new StringReader(docs));
    }

    public void setConfig(Config config) {
      htmlParser = new DemoHTMLParser();
    }
  }

  private void assertDocData(DocData dd, String expName, String expTitle, String expBody, Date expDate) {
  private void assertDocData(DocData dd, String expName, String expTitle,
                             String expBody, Date expDate)
      throws ParseException {
    assertNotNull(dd);
    assertEquals(expName, dd.getName());
    assertEquals(expTitle, dd.getTitle());
    assertTrue(dd.getBody().indexOf(expBody) != -1);
    assertEquals(expDate, dd.getDate());
    Date date = dd.getDate() != null ? DateTools.stringToDate(dd.getDate()) : null;
    assertEquals(expDate, date);
  }

  private void assertNoMoreDataException(StringableTrecDocMaker stdm) throws Exception {
  private void assertNoMoreDataException(StringableTrecSource stdm) throws Exception {
    boolean thrown = false;
    try {
      stdm.getNextDocData();
      stdm.getNextDocData(null);
    } catch (NoMoreDataException e) {
      thrown = true;
    }

@@ -93,14 +104,14 @@ public class TrecDocMakerTest extends TestCase {
        "</body>\r\n" +
        "\r\n" +
        "</DOC>";
    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
    stdm.setHTMLParser(new DemoHTMLParser());

    DocData dd = stdm.getNextDocData();
    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
    StringableTrecSource source = new StringableTrecSource(docs, false);
    source.setConfig(null);

    DocData dd = source.getNextDocData(new DocData());
    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));

    assertNoMoreDataException(stdm);
    assertNoMoreDataException(source);
  }

  public void testTwoDocuments() throws Exception {

@@ -156,18 +167,18 @@ public class TrecDocMakerTest extends TestCase {
        "</body>\r\n" +
        "\r\n" +
        "</DOC>";
    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
    stdm.setHTMLParser(new DemoHTMLParser());

    DocData dd = stdm.getNextDocData();
    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
    StringableTrecSource source = new StringableTrecSource(docs, false);
    source.setConfig(null);

    DocData dd = source.getNextDocData(new DocData());
    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));

    dd = stdm.getNextDocData();
    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
    dd = source.getNextDocData(dd);
    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));

    assertNoMoreDataException(stdm);
    assertNoMoreDataException(source);
  }

  // If a Date: attribute is missing, make sure the document is not skipped, but

@@ -224,17 +235,17 @@ public class TrecDocMakerTest extends TestCase {
        "</body>\r\n" +
        "\r\n" +
        "</DOC>";
    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
    stdm.setHTMLParser(new DemoHTMLParser());
    StringableTrecSource source = new StringableTrecSource(docs, false);
    source.setConfig(null);

    DocData dd = stdm.getNextDocData();
    DocData dd = source.getNextDocData(new DocData());
    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);

    dd = stdm.getNextDocData();
    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", stdm
    dd = source.getNextDocData(dd);
    assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source
        .parseDate("Sun, 11 Jan 2009 08:01:00 GMT"));

    assertNoMoreDataException(stdm);
    assertNoMoreDataException(source);
  }

  // When a 'bad date' is input (unparsable date), make sure the DocData date is

@@ -266,13 +277,13 @@ public class TrecDocMakerTest extends TestCase {
        "</body>\r\n" +
        "\r\n" +
        "</DOC>";
    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, false);
    stdm.setHTMLParser(new DemoHTMLParser());
    StringableTrecSource source = new StringableTrecSource(docs, false);
    source.setConfig(null);

    DocData dd = stdm.getNextDocData();
    DocData dd = source.getNextDocData(new DocData());
    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null);

    assertNoMoreDataException(stdm);
    assertNoMoreDataException(source);
  }

  public void testForever() throws Exception {

@@ -302,16 +313,16 @@ public class TrecDocMakerTest extends TestCase {
        "</body>\r\n" +
        "\r\n" +
        "</DOC>";
    StringableTrecDocMaker stdm = new StringableTrecDocMaker(docs, true);
    stdm.setHTMLParser(new DemoHTMLParser());
    StringableTrecSource source = new StringableTrecSource(docs, true);
    source.setConfig(null);

    DocData dd = stdm.getNextDocData();
    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", stdm
    DocData dd = source.getNextDocData(new DocData());
    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));

    // same document, but the second iteration changes the name.
    dd = stdm.getNextDocData();
    assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", stdm
    dd = source.getNextDocData(dd);
    assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
        .parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));

    // Don't test that NoMoreDataException is thrown, since the forever flag is
@@ -27,8 +27,8 @@ import java.util.Properties;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;

@@ -40,7 +40,7 @@ import org.apache.lucene.document.Field.Store;
public class WriteLineDocTaskTest extends BenchmarkTestCase {

  // class has to be public so that Class.forName.newInstance() will work
  public static final class WriteLineDocMaker extends BasicDocMaker {
  public static final class WriteLineDocMaker extends DocMaker {

    protected DocData getNextDocData() throws NoMoreDataException, Exception {
      throw new UnsupportedOperationException("not implemented");
@@ -23,7 +23,7 @@ import java.io.FileReader;
import java.io.PrintWriter;

import org.apache.lucene.benchmark.byTask.TestPerfTasksLogic;
import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource;
import org.apache.lucene.benchmark.quality.Judge;
import org.apache.lucene.benchmark.quality.QualityQuery;
import org.apache.lucene.benchmark.quality.QualityQueryParser;

@@ -155,10 +155,10 @@ public class TestQualityRun extends TestCase {
    // 1. alg definition
    String algLines[] = {
        "# ----- properties ",
        "doc.maker="+ReutersDocMaker.class.getName(),
        "doc.add.log.step=2500",
        "content.source="+ReutersContentSource.class.getName(),
        "content.source.log.step=2500",
        "doc.term.vector=false",
        "doc.maker.forever=false",
        "content.source.forever=false",
        "directory=FSDirectory",
        "doc.stored=true",
        "doc.tokenized=true",