LUCENE-967: add ReadTokensTask to allow for benchmarking just tokenization

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@561908 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2007-08-01 18:54:43 +00:00
parent 60c6a847b5
commit 0fd867732e
9 changed files with 281 additions and 6 deletions

View File

@@ -4,6 +4,9 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
8/1/07
LUCENE-967: Add "ReadTokensTask" to allow for benchmarking just tokenization.
7/27/07
LUCENE-836: Add support for search quality benchmarking, running
a set of queries against a searcher, and, optionally produce a submission

View File

@@ -0,0 +1,36 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
#
# This alg reads all tokens out of a document but does not index them.
# This is useful for benchmarking tokenizers.
#
# To use this, cd to contrib/benchmark and then run:
#
# ant run-task -Dtask.alg=conf/tokenize.alg
#
doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
doc.maker.forever=false
#
# -------------------------------------------------------------------------------------
{ReadTokens > : *
RepSumByName
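
The alg above exercises only the analysis chain: documents are pulled from the Reuters doc maker, tokenized, and then discarded without ever reaching an IndexWriter. As a rough standalone sketch of the same idea (not part of the patch; the analyzer choice and sample text are arbitrary), the Lucene 2.x TokenStream API can be driven directly:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

/** Minimal sketch: tokenize a string and count its tokens without indexing anything. */
public class TokenizeOnlyDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    TokenStream stream = analyzer.tokenStream("body",
        new StringReader("reads all tokens out of a document but does not index them"));
    int tokenCount = 0;
    while (stream.next() != null)   // Lucene 2.x: next() returns the next Token, or null at end
      tokenCount++;
    stream.close();
    System.out.println(tokenCount + " tokens");
  }
}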

View File

@@ -69,6 +69,7 @@ public class Benchmark {
throw new IllegalStateException("Benchmark was already executed");
}
executed = true;
runData.setStartTimeMillis();
algorithm.execute();
}
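
The new setStartTimeMillis() call records a wall-clock reference just before the algorithm runs; PerfRunData stores it (next hunk) and ReadTokensTask.log() later subtracts it from System.currentTimeMillis() to report elapsed seconds. A minimal, self-contained sketch of that pattern (the class and the sleep are illustrative stand-ins, not benchmark code):

/** Illustrative sketch of the start-time / elapsed-seconds pattern. */
public class ElapsedDemo {
  private static long startTimeMillis;

  public static void main(String[] args) throws Exception {
    startTimeMillis = System.currentTimeMillis();   // recorded once, before the work starts
    Thread.sleep(250);                              // stand-in for algorithm.execute()
    double seconds = (System.currentTimeMillis() - startTimeMillis) / 1000.0;
    System.out.println("--> " + seconds + " sec elapsed");
  }
}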

View File

@@ -137,8 +137,11 @@ public class PerfRunData {
// release unused stuff
System.runFinalization();
System.gc();
}
public long setStartTimeMillis() {
startTimeMillis = System.currentTimeMillis();
return startTimeMillis;
}
/**

View File

@@ -534,6 +534,7 @@ Here is a list of currently defined properties:
</li><li>doc.delete.log.step
</li><li>log.queries
</li><li>task.max.depth.log
</li><li>doc.tokenize.log.step
</li></ul>
</li>

View File

@@ -78,7 +78,7 @@ public abstract class PerfTask implements Cloneable {
if (reportStats && depth <= maxDepthLogStart && !shouldNeverLogAtStart()) {
System.out.println("------------> starting task: " + getName());
}
if (shouldNotRecordStats() || !reportStats) {
if (!reportStats || shouldNotRecordStats()) {
setup();
int count = doLogic();
tearDown();

View File

@@ -0,0 +1,168 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.text.NumberFormat;
import java.io.Reader;
import java.util.List;
/**
* Simple task to test performance of tokenizers. It just
* creates a token stream for each field of the document and
* reads all tokens out of that stream.
* <br>Relevant properties: <code>doc.tokenize.log.step</code>.
*/
public class ReadTokensTask extends PerfTask {
/**
* Default value for property <code>doc.tokenize.log.step</code> - indicating how often
* an "added N docs / M tokens" message should be logged.
*/
public static final int DEFAULT_DOC_LOG_STEP = 500;
public ReadTokensTask(PerfRunData runData) {
super(runData);
}
private int logStep = -1;
int count = 0;
int totalTokenCount = 0;
// volatile data passed between setup(), doLogic(), tearDown().
private Document doc = null;
/*
* (non-Javadoc)
* @see PerfTask#setup()
*/
public void setup() throws Exception {
super.setup();
DocMaker docMaker = getRunData().getDocMaker();
doc = docMaker.makeDocument();
}
/* (non-Javadoc)
* @see PerfTask#tearDown()
*/
public void tearDown() throws Exception {
log(++count);
doc = null;
super.tearDown();
}
Token token = new Token("", 0, 0);
public int doLogic() throws Exception {
List fields = doc.getFields();
final int numField = fields.size();
Analyzer analyzer = getRunData().getAnalyzer();
int tokenCount = 0;
for(int i=0;i<numField;i++) {
final Field field = (Field) fields.get(i);
final TokenStream stream;
final TokenStream streamValue = field.tokenStreamValue();
if (streamValue != null)
stream = streamValue;
else {
// the field does not have a TokenStream,
// so we have to obtain one from the analyzer
final Reader reader; // find or make Reader
final Reader readerValue = field.readerValue();
if (readerValue != null)
reader = readerValue;
else {
String stringValue = field.stringValue();
if (stringValue == null)
throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
stringReader.init(stringValue);
reader = stringReader;
}
// Tokenize field
stream = analyzer.tokenStream(field.name(), reader);
}
// reset the TokenStream to the first token
stream.reset();
while(stream.next() != null)
tokenCount++;
}
totalTokenCount += tokenCount;
return tokenCount;
}
private void log(int count) {
if (logStep<0) {
// init once per instance
logStep = getRunData().getConfig().get("doc.tokenize.log.step", DEFAULT_DOC_LOG_STEP);
}
if (logStep>0 && (count%logStep)==0) {
double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
NumberFormat nf = NumberFormat.getInstance();
nf.setMaximumFractionDigits(2);
System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs" + "; " + totalTokenCount + " tokens");
}
}
/* Simple StringReader that can be reset to a new string;
* we use this when tokenizing the string value from a
* Field. */
ReusableStringReader stringReader = new ReusableStringReader();
private final static class ReusableStringReader extends Reader {
int upto;
int left;
String s;
void init(String s) {
this.s = s;
left = s.length();
this.upto = 0;
}
public int read(char[] c) {
return read(c, 0, c.length);
}
public int read(char[] c, int off, int len) {
if (left > len) {
s.getChars(upto, upto+len, c, off);
upto += len;
left -= len;
return len;
} else if (0 == left) {
return -1;
} else {
s.getChars(upto, upto+left, c, off);
int r = left;
left = 0;
upto = s.length();
return r;
}
}
public void close() {};
}
}

View File

@@ -100,15 +100,19 @@ public class TaskSequence extends PerfTask {
int count = 0;
boolean exhausted = false;
final int numTasks = tasks.size();
final PerfTask[] tasksArray = new PerfTask[numTasks];
for(int k=0;k<numTasks;k++)
tasksArray[k] = (PerfTask) tasks.get(k);
for (int k=0; (repetitions==REPEAT_EXHAUST && !exhausted) || k<repetitions; k++) {
for (Iterator it = tasks.iterator(); it.hasNext();) {
PerfTask task = (PerfTask) it.next();
for(int l=0;l<numTasks;l++)
try {
count += task.runAndMaybeStats(letChildReport);
count += tasksArray[l].runAndMaybeStats(letChildReport);
} catch (NoMoreDataException e) {
exhausted = true;
}
}
}
return count;
}
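
This hunk copies the task list into an array once, before the repetition loop, presumably so each repetition indexes the array instead of creating a fresh Iterator in what can be a very hot loop. A self-contained sketch of the same hoisting pattern (plain strings stand in for PerfTask objects; nothing here is benchmark code):

import java.util.ArrayList;
import java.util.List;

/** Illustrative sketch: hoist the list-to-array copy out of the repetition loop. */
public class HoistDemo {
  public static void main(String[] args) {
    List tasks = new ArrayList();            // raw types, matching the pre-generics style above
    tasks.add("AddDoc");
    tasks.add("ReadTokens");

    final int numTasks = tasks.size();
    final String[] tasksArray = new String[numTasks];
    for (int k = 0; k < numTasks; k++)
      tasksArray[k] = (String) tasks.get(k);

    for (int rep = 0; rep < 3; rep++)         // repetition loop
      for (int l = 0; l < numTasks; l++)      // no Iterator allocated per repetition
        System.out.println("rep " + rep + ": run " + tasksArray[l]);
  }
}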

View File

@@ -21,14 +21,19 @@ import java.io.StringReader;
import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;
import java.util.List;
import java.util.Iterator;
import org.apache.lucene.benchmark.byTask.Benchmark;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import junit.framework.TestCase;
@@ -223,6 +228,60 @@ public class TestPerfTasksLogic extends TestCase {
lineFile.delete();
}
/**
* Test ReadTokensTask
*/
public void testReadTokens() throws Exception {
// We will call ReadTokens on this many docs
final int NUM_DOCS = 100;
// Read tokens from first NUM_DOCS docs from Reuters and
// then build index from the same docs
String algLines1[] = {
"# ----- properties ",
"analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
"doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
"# ----- alg ",
"{ReadTokens}: " + NUM_DOCS,
"ResetSystemErase",
"CreateIndex",
"{AddDoc}: " + NUM_DOCS,
"CloseIndex",
};
// Run algo
Benchmark benchmark = execBenchmark(algLines1);
List stats = benchmark.getRunData().getPoints().taskStats();
// Count how many tokens all ReadTokens saw
int totalTokenCount1 = 0;
for (Iterator it = stats.iterator(); it.hasNext();) {
TaskStats stat = (TaskStats) it.next();
if (stat.getTask().getName().equals("ReadTokens")) {
totalTokenCount1 += stat.getCount();
}
}
// Separately count how many tokens are actually in the index:
IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory());
assertEquals(NUM_DOCS, reader.numDocs());
TermEnum terms = reader.terms();
TermDocs termDocs = reader.termDocs();
int totalTokenCount2 = 0;
while(terms.next()) {
termDocs.seek(terms.term());
while(termDocs.next())
totalTokenCount2 += termDocs.freq();
}
reader.close();
// Make sure they are the same
assertEquals(totalTokenCount1, totalTokenCount2);
}
// create the benchmark and execute it.
public static Benchmark execBenchmark(String[] algLines) throws Exception {
String algText = algLinesToText(algLines);