mirror of https://github.com/apache/lucene.git
LUCENE-967: add ReadTokensTask to allow for benchmarking just tokenization
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@561908 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
60c6a847b5
commit
0fd867732e
|
@ -4,6 +4,9 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
|||
|
||||
$Id:$
|
||||
|
||||
8/1/07
|
||||
LUCENE-967: Add "ReadTokensTask" to allow for benchmarking just tokenization.
|
||||
|
||||
7/27/07
|
||||
LUCENE-836: Add support for search quality benchmarking, running
|
||||
a set of queries against a searcher, and, optionally produce a submission
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
#/**
|
||||
# * Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# * contributor license agreements. See the NOTICE file distributed with
|
||||
# * this work for additional information regarding copyright ownership.
|
||||
# * The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# * (the "License"); you may not use this file except in compliance with
|
||||
# * the License. You may obtain a copy of the License at
|
||||
# *
|
||||
# * http://www.apache.org/licenses/LICENSE-2.0
|
||||
# *
|
||||
# * Unless required by applicable law or agreed to in writing, software
|
||||
# * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# * See the License for the specific language governing permissions and
|
||||
# * limitations under the License.
|
||||
# */
|
||||
# -------------------------------------------------------------------------------------
|
||||
|
||||
#
|
||||
# This alg reads all tokens out of a document but does not index them.
|
||||
# This is useful for benchmarking tokenizers.
|
||||
#
|
||||
# To use this, cd to contrib/benchmark and then run:
|
||||
#
|
||||
# ant run-task -Dtask.alg=conf/tokenize.alg
|
||||
#
|
||||
|
||||
doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
|
||||
doc.maker.forever=false
|
||||
|
||||
|
||||
#
|
||||
-------------------------------------------------------------------------------------
|
||||
|
||||
{ReadTokens > : *
|
||||
RepSumByName
|
|
@ -69,6 +69,7 @@ public class Benchmark {
|
|||
throw new IllegalStateException("Benchmark was already executed");
|
||||
}
|
||||
executed = true;
|
||||
runData.setStartTimeMillis();
|
||||
algorithm.execute();
|
||||
}
|
||||
|
||||
|
|
|
@ -137,8 +137,11 @@ public class PerfRunData {
|
|||
// release unused stuff
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
|
||||
}
|
||||
|
||||
public long setStartTimeMillis() {
|
||||
startTimeMillis = System.currentTimeMillis();
|
||||
return startTimeMillis;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -534,6 +534,7 @@ Here is a list of currently defined properties:
|
|||
</li><li>doc.delete.log.step
|
||||
</li><li>log.queries
|
||||
</li><li>task.max.depth.log
|
||||
</li><li>doc.tokenize.log.step
|
||||
</li></ul>
|
||||
</li>
|
||||
|
||||
|
|
|
@ -78,7 +78,7 @@ public abstract class PerfTask implements Cloneable {
|
|||
if (reportStats && depth <= maxDepthLogStart && !shouldNeverLogAtStart()) {
|
||||
System.out.println("------------> starting task: " + getName());
|
||||
}
|
||||
if (shouldNotRecordStats() || !reportStats) {
|
||||
if (!reportStats || shouldNotRecordStats()) {
|
||||
setup();
|
||||
int count = doLogic();
|
||||
tearDown();
|
||||
|
|
|
@ -0,0 +1,168 @@
|
|||
package org.apache.lucene.benchmark.byTask.tasks;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import java.text.NumberFormat;
|
||||
import java.io.Reader;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Simple task to test performance of tokenizers. It just
|
||||
* creates a token stream for each field of the document and
|
||||
* read all tokens out of that stream.
|
||||
* <br>Relevant properties: <code>doc.tokenize.log.step</code>.
|
||||
*/
|
||||
public class ReadTokensTask extends PerfTask {
|
||||
|
||||
/**
|
||||
* Default value for property <code>doc.tokenize.log.step<code> - indicating how often
|
||||
* an "added N docs / M tokens" message should be logged.
|
||||
*/
|
||||
public static final int DEFAULT_DOC_LOG_STEP = 500;
|
||||
|
||||
public ReadTokensTask(PerfRunData runData) {
|
||||
super(runData);
|
||||
}
|
||||
|
||||
private int logStep = -1;
|
||||
int count = 0;
|
||||
int totalTokenCount = 0;
|
||||
|
||||
// volatile data passed between setup(), doLogic(), tearDown().
|
||||
private Document doc = null;
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see PerfTask#setup()
|
||||
*/
|
||||
public void setup() throws Exception {
|
||||
super.setup();
|
||||
DocMaker docMaker = getRunData().getDocMaker();
|
||||
doc = docMaker.makeDocument();
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see PerfTask#tearDown()
|
||||
*/
|
||||
public void tearDown() throws Exception {
|
||||
log(++count);
|
||||
doc = null;
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
Token token = new Token("", 0, 0);
|
||||
|
||||
public int doLogic() throws Exception {
|
||||
List fields = doc.getFields();
|
||||
final int numField = fields.size();
|
||||
Analyzer analyzer = getRunData().getAnalyzer();
|
||||
int tokenCount = 0;
|
||||
for(int i=0;i<numField;i++) {
|
||||
final Field field = (Field) fields.get(i);
|
||||
final TokenStream stream;
|
||||
final TokenStream streamValue = field.tokenStreamValue();
|
||||
|
||||
if (streamValue != null)
|
||||
stream = streamValue;
|
||||
else {
|
||||
// the field does not have a TokenStream,
|
||||
// so we have to obtain one from the analyzer
|
||||
final Reader reader; // find or make Reader
|
||||
final Reader readerValue = field.readerValue();
|
||||
|
||||
if (readerValue != null)
|
||||
reader = readerValue;
|
||||
else {
|
||||
String stringValue = field.stringValue();
|
||||
if (stringValue == null)
|
||||
throw new IllegalArgumentException("field must have either TokenStream, String or Reader value");
|
||||
stringReader.init(stringValue);
|
||||
reader = stringReader;
|
||||
}
|
||||
|
||||
// Tokenize field
|
||||
stream = analyzer.tokenStream(field.name(), reader);
|
||||
}
|
||||
|
||||
// reset the TokenStream to the first token
|
||||
stream.reset();
|
||||
|
||||
while(stream.next() != null)
|
||||
tokenCount++;
|
||||
}
|
||||
totalTokenCount += tokenCount;
|
||||
return tokenCount;
|
||||
}
|
||||
|
||||
private void log(int count) {
|
||||
if (logStep<0) {
|
||||
// init once per instance
|
||||
logStep = getRunData().getConfig().get("doc.tokenize.log.step", DEFAULT_DOC_LOG_STEP);
|
||||
}
|
||||
if (logStep>0 && (count%logStep)==0) {
|
||||
double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
|
||||
NumberFormat nf = NumberFormat.getInstance();
|
||||
nf.setMaximumFractionDigits(2);
|
||||
System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs" + "; " + totalTokenCount + " tokens");
|
||||
}
|
||||
}
|
||||
|
||||
/* Simple StringReader that can be reset to a new string;
|
||||
* we use this when tokenizing the string value from a
|
||||
* Field. */
|
||||
ReusableStringReader stringReader = new ReusableStringReader();
|
||||
|
||||
private final static class ReusableStringReader extends Reader {
|
||||
int upto;
|
||||
int left;
|
||||
String s;
|
||||
void init(String s) {
|
||||
this.s = s;
|
||||
left = s.length();
|
||||
this.upto = 0;
|
||||
}
|
||||
public int read(char[] c) {
|
||||
return read(c, 0, c.length);
|
||||
}
|
||||
public int read(char[] c, int off, int len) {
|
||||
if (left > len) {
|
||||
s.getChars(upto, upto+len, c, off);
|
||||
upto += len;
|
||||
left -= len;
|
||||
return len;
|
||||
} else if (0 == left) {
|
||||
return -1;
|
||||
} else {
|
||||
s.getChars(upto, upto+left, c, off);
|
||||
int r = left;
|
||||
left = 0;
|
||||
upto = s.length();
|
||||
return r;
|
||||
}
|
||||
}
|
||||
public void close() {};
|
||||
}
|
||||
}
|
|
@ -100,15 +100,19 @@ public class TaskSequence extends PerfTask {
|
|||
|
||||
int count = 0;
|
||||
boolean exhausted = false;
|
||||
|
||||
final int numTasks = tasks.size();
|
||||
final PerfTask[] tasksArray = new PerfTask[numTasks];
|
||||
for(int k=0;k<numTasks;k++)
|
||||
tasksArray[k] = (PerfTask) tasks.get(k);
|
||||
|
||||
for (int k=0; (repetitions==REPEAT_EXHAUST && !exhausted) || k<repetitions; k++) {
|
||||
for (Iterator it = tasks.iterator(); it.hasNext();) {
|
||||
PerfTask task = (PerfTask) it.next();
|
||||
for(int l=0;l<numTasks;l++)
|
||||
try {
|
||||
count += task.runAndMaybeStats(letChildReport);
|
||||
count += tasksArray[l].runAndMaybeStats(letChildReport);
|
||||
} catch (NoMoreDataException e) {
|
||||
exhausted = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
|
|
@ -21,14 +21,19 @@ import java.io.StringReader;
|
|||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.BufferedReader;
|
||||
import java.util.List;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.Benchmark;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
|
||||
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
|
@ -223,6 +228,60 @@ public class TestPerfTasksLogic extends TestCase {
|
|||
lineFile.delete();
|
||||
}
|
||||
|
||||
/**
|
||||
* Test ReadTokensTask
|
||||
*/
|
||||
public void testReadTokens() throws Exception {
|
||||
|
||||
// We will call ReadTokens on this many docs
|
||||
final int NUM_DOCS = 100;
|
||||
|
||||
// Read tokens from first NUM_DOCS docs from Reuters and
|
||||
// then build index from the same docs
|
||||
String algLines1[] = {
|
||||
"# ----- properties ",
|
||||
"analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer",
|
||||
"doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
|
||||
"# ----- alg ",
|
||||
"{ReadTokens}: " + NUM_DOCS,
|
||||
"ResetSystemErase",
|
||||
"CreateIndex",
|
||||
"{AddDoc}: " + NUM_DOCS,
|
||||
"CloseIndex",
|
||||
};
|
||||
|
||||
// Run algo
|
||||
Benchmark benchmark = execBenchmark(algLines1);
|
||||
|
||||
List stats = benchmark.getRunData().getPoints().taskStats();
|
||||
|
||||
// Count how many tokens all ReadTokens saw
|
||||
int totalTokenCount1 = 0;
|
||||
for (Iterator it = stats.iterator(); it.hasNext();) {
|
||||
TaskStats stat = (TaskStats) it.next();
|
||||
if (stat.getTask().getName().equals("ReadTokens")) {
|
||||
totalTokenCount1 += stat.getCount();
|
||||
}
|
||||
}
|
||||
|
||||
// Separately count how many tokens are actually in the index:
|
||||
IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory());
|
||||
assertEquals(NUM_DOCS, reader.numDocs());
|
||||
|
||||
TermEnum terms = reader.terms();
|
||||
TermDocs termDocs = reader.termDocs();
|
||||
int totalTokenCount2 = 0;
|
||||
while(terms.next()) {
|
||||
termDocs.seek(terms.term());
|
||||
while(termDocs.next())
|
||||
totalTokenCount2 += termDocs.freq();
|
||||
}
|
||||
reader.close();
|
||||
|
||||
// Make sure they are the same
|
||||
assertEquals(totalTokenCount1, totalTokenCount2);
|
||||
}
|
||||
|
||||
// create the benchmark and execute it.
|
||||
public static Benchmark execBenchmark(String[] algLines) throws Exception {
|
||||
String algText = algLinesToText(algLines);
|
||||
|
|
Loading…
Reference in New Issue