Commit benchmark code. Thanks to Doron Cohen for updates

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@475222 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2006-11-15 13:09:33 +00:00
parent 5627c6201c
commit 423c1b4c9c
17 changed files with 2144 additions and 0 deletions

View File

@ -0,0 +1,19 @@
<?xml version="1.0"?>
<!--
Configuration file for use with the Driver(InputSource) constructor. Uses Digester to load. The
class attribute is the fully qualified name of the Benchmarker to load. It must have a no-argument constructor. All attributes invoke the appropriate bean accessor on the Benchmarker -->
<benchmark >
<benchmarker class="org.apache.lucene.benchmark.standard.StandardBenchmarker"/>
<!-- Attributes map to bean getters/setters on the specific instantiation of StandardOptions. So, if you implement your own options, then you can
initialize automatically by setting the bean attributes here.
All attributes are optional:
runCount - The number of times to run the benchmark successively. Default: 5
logStep - When indexing, how often to output how many documents have been processed. Default: 1000
scaleUp - How many times to add the same documents. Default: 5
maximumDocumentsToIndex - The number of documents to index at a time (multiply by the scaleUp factor for the total number of documents indexed). Default is Integer.MAX_VALUE
-->
<options class="org.apache.lucene.benchmark.standard.StandardOptions" runCount="1" logStep="500" scaleUp="1" maximumDocumentsToIndex="2000"/>
</benchmark>

View File

@ -0,0 +1,19 @@
<?xml version="1.0"?>
<!--
Configuration file for use with the Driver(InputSource) constructor. Uses Digester to load. The
class attribute is the fully qualified name of the Benchmarker to load. It must have a no-argument constructor. All attributes invoke the appropriate bean accessor on the Benchmarker -->
<benchmark >
<benchmarker class="org.apache.lucene.benchmark.standard.StandardBenchmarker"/>
<!-- Attributes map to bean getters/setters on the specific instantiation of StandardOptions. So, if you implement your own options, then you can
initialize automatically by setting the bean attributes here.
All attributes are optional.
runCount - The number of times to run the benchmark successively. Default: 5
logStep - When indexing, how often to output how many documents have been processed. Default: 1000
scaleUp - How many times to add the same documents. Default: 5
maximumDocumentsToIndex - The number of documents to index at a time (multiply by the scaleUp factor for the total number of documents indexed). Default is Integer.MAX_VALUE
-->
<options class="org.apache.lucene.benchmark.standard.StandardOptions" runCount="5" logStep="1000" scaleUp="5"/>
</benchmark>

View File

@ -0,0 +1,61 @@
package org.apache.lucene.benchmark;
import java.io.File;
import java.io.IOException;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Base class for {@link Benchmarker} implementations, providing shared
 * file-system helper utilities.
 **/
public abstract class AbstractBenchmarker implements Benchmarker
{
    /**
     * Recursively delete a file or directory tree, even if non-empty.
     *
     * @param dir file or directory to remove
     * @return true on success, false if none or only part of the files have been deleted
     * @throws java.io.IOException
     */
    public static boolean fullyDelete(File dir) throws IOException
    {
        if (dir == null || !dir.exists())
        {
            return false;
        }
        //listFiles() returns null when dir is a plain file; it is deleted below
        File[] entries = dir.listFiles();
        if (entries != null)
        {
            for (int idx = 0; idx < entries.length; idx++)
            {
                File entry = entries[idx];
                boolean removed = entry.isFile() ? entry.delete() : fullyDelete(entry);
                if (!removed)
                {
                    return false;
                }
            }
        }
        return dir.delete();
    }
}

View File

@ -0,0 +1,29 @@
package org.apache.lucene.benchmark;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Marker Interface defining some common options. Implementations should define their own set of options that can be
 * cast to in the {@link Benchmarker} interface.
 * <p/>
 * As benchmarks are added, perhaps a common set of Options will become clear.
 * See {@link org.apache.lucene.benchmark.standard.StandardOptions} for the
 * current standard implementation.
 **/
public interface BenchmarkOptions
{
}

View File

@ -0,0 +1,39 @@
package org.apache.lucene.benchmark;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import org.apache.lucene.benchmark.stats.TestData;
/**
 * A Benchmarker runs a complete benchmark and reports its results as
 * {@link org.apache.lucene.benchmark.stats.TestData}.
 **/
public interface Benchmarker
{
    /**
     * Benchmark according to the implementation, using the workingDir as the place to store things.
     *
     * @param workingDir The {@link java.io.File} directory to store temporary data in for running the benchmark
     * @param options    Any {@link BenchmarkOptions} that are needed for this benchmark.  Implementations
     *                   typically cast this to their own concrete options type.
     * @return The {@link org.apache.lucene.benchmark.stats.TestData} used to run the benchmark.
     * @throws Exception if the benchmark cannot be run
     */
    TestData[] benchmark(File workingDir, BenchmarkOptions options) throws Exception;
}

View File

@ -0,0 +1,33 @@
package org.apache.lucene.benchmark;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Shared default values and helpers for benchmark options.
 **/
public class Constants
{
    /** Default number of times to repeat each benchmark. */
    public static final int DEFAULT_RUN_COUNT = 5;
    /** Default number of times the same document set is re-added to the index. */
    public static final int DEFAULT_SCALE_UP = 5;
    /** Default interval (in documents) between progress log messages. */
    public static final int DEFAULT_LOG_STEP = 1000;
    /** Maps an int index (0/1) to a Boolean; used to enumerate flag combinations.
     *  Fixed: declared final so the reference cannot be reassigned. */
    public static final Boolean[] BOOLEANS = new Boolean[] { Boolean.FALSE, Boolean.TRUE };
    /** Default cap on the number of documents indexed per scale-up pass. */
    public static final int DEFAULT_MAXIMUM_DOCUMENTS = Integer.MAX_VALUE;
}

View File

@ -0,0 +1,145 @@
package org.apache.lucene.benchmark;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import org.apache.commons.digester.Digester;
import org.apache.lucene.benchmark.standard.StandardBenchmarker;
import org.apache.lucene.benchmark.stats.TestData;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Sets up and runs a {@link Benchmarker}: either configured programmatically,
 * or built from an XML configuration file parsed with Commons Digester.
 **/
public class Driver
{
    private File workingDir;
    private Benchmarker benchmarker;
    private BenchmarkOptions options;

    public Driver()
    {
    }

    public Driver(Benchmarker benchmarker, BenchmarkOptions options)
    {
        this.benchmarker = benchmarker;
        this.options = options;
    }

    /**
     * Creates a Driver using Digester.
     *
     * @param workingDir directory used to store temporary benchmark data
     * @param inputSource the XML configuration to parse (see the sample conf files)
     * @throws IOException if the configuration cannot be read
     * @throws SAXException if the configuration is not well-formed
     */
    public Driver(File workingDir, InputSource inputSource) throws IOException, SAXException
    {
        Digester digester = new Digester();
        digester.setValidating(false);
        //the "class" attribute names the Benchmarker implementation to instantiate
        digester.addObjectCreate("benchmark/benchmarker", "class", StandardBenchmarker.class);
        digester.addSetProperties("benchmark/benchmarker");
        digester.addSetNext("benchmark/benchmarker", "setBenchmarker");
        //NOTE(review): BenchmarkOptions is an interface, so the "class" attribute is
        //effectively mandatory on <options>; instantiation fails without it.
        digester.addObjectCreate("benchmark/options", "class", BenchmarkOptions.class);
        digester.addSetProperties("benchmark/options");
        digester.addSetNext("benchmark/options", "setOptions");
        digester.push(this);
        digester.parse(inputSource);
        this.workingDir = workingDir;
    }

    /** Runs the configured benchmark in the working directory. */
    private void run() throws Exception
    {
        TestData [] data = benchmarker.benchmark(workingDir, options);
        //Print out summary:
        /*System.out.println("Test Data:");
        for (int i = 0; i < data.length; i++)
        {
            TestData testData = data[i];
            System.out.println("---------------");
            System.out.println(testData.showRunData(testData.getId()));
            System.out.println("---------------");
        }*/
    }

    public Benchmarker getBenchmarker()
    {
        return benchmarker;
    }

    public void setBenchmarker(Benchmarker benchmarker)
    {
        this.benchmarker = benchmarker;
    }

    public BenchmarkOptions getOptions()
    {
        return options;
    }

    public void setOptions(BenchmarkOptions options)
    {
        this.options = options;
    }

    public File getWorkingDir()
    {
        return workingDir;
    }

    public void setWorkingDir(File workingDir)
    {
        this.workingDir = workingDir;
    }

    /**
     * Command-line entry point; expects two args: working directory and config file.
     */
    public static void main(String[] args)
    {
        if (args.length != 2)
        {
            printHelp(args);
            System.exit(0);
        }
        File workingDir = new File(args[0]);
        File configFile = new File(args[1]);
        if (configFile.exists())
        {
            //Setup
            try
            {
                Driver driver = new Driver(workingDir, new InputSource(new FileReader(configFile)));
                driver.run();
            }
            catch (Exception e)
            {
                e.printStackTrace(System.err);
            }
        }
        else
        {
            //fix: a missing config file was previously ignored silently
            System.err.println("Config file does not exist: " + configFile.getAbsolutePath());
            printHelp(args);
        }
    }

    private static void printHelp(String[] args)
    {
        //fix: a space was missing between the class name and "<working dir>"
        System.out.println("Usage: java -cp [...] " + Driver.class.getName() + " <working dir> <config-file>");
    }
}

View File

@ -0,0 +1,59 @@
package org.apache.lucene.benchmark.standard;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.index.Term;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Canned queries — free-text expressions and prebuilt span/wildcard queries —
 * exercised against the Reuters index by {@link StandardBenchmarker}.
 **/
public class ReutersQueries
{
    /** Query expressions parsed by the Lucene QueryParser at benchmark time.
     *  Fixed: declared final so the reference cannot be reassigned. */
    public static final String [] STANDARD_QUERIES = {
            //Start with some short queries
            "Salomon", "Comex", "night trading", "Japan Sony",
            //Try some Phrase Queries
            "\"Sony Japan\"", "\"food needs\"~3",
            "\"World Bank\"^2 AND Nigeria", "\"World Bank\" -Nigeria",
            "\"Ford Credit\"~5",
            //Try some longer queries
            "airline Europe Canada destination",
            "Long term pressure by trade " +
                    "ministers is necessary if the current Uruguay round of talks on " +
                    "the General Agreement on Trade and Tariffs (GATT) is to " +
                    "succeed"
    };

    /**
     * Build a small set of already-constructed span and wildcard queries.
     *
     * @param field the indexed field the queries run against
     * @return the prebuilt queries
     */
    public static Query[] getPrebuiltQueries(String field)
    {
        //be wary of unanalyzed text
        return new Query[]{
                new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 5),
                new SpanNearQuery(new SpanQuery[]{new SpanTermQuery(new Term(field, "night")), new SpanTermQuery(new Term(field, "trading"))}, 4, false),
                new SpanNearQuery(new SpanQuery[]{new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 10), new SpanTermQuery(new Term(field, "credit"))}, 10, false),
                new WildcardQuery(new Term(field, "fo*")),
        };
    }
}

View File

@ -0,0 +1,460 @@
package org.apache.lucene.benchmark.standard;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.benchmark.AbstractBenchmarker;
import org.apache.lucene.benchmark.BenchmarkOptions;
import org.apache.lucene.benchmark.Benchmarker;
import org.apache.lucene.benchmark.stats.QueryData;
import org.apache.lucene.benchmark.stats.TestData;
import org.apache.lucene.benchmark.stats.TestRunData;
import org.apache.lucene.benchmark.stats.TimeData;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.FSDirectory;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Reads in the Reuters Collection, downloaded from http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz
 * in the workingDir/reuters and indexes them using the {@link org.apache.lucene.analysis.standard.StandardAnalyzer}
 *<p/>
 * Runs a standard set of documents through an Indexer and then runs a standard set of queries against the index.
 *
 * @see org.apache.lucene.benchmark.standard.StandardBenchmarker#benchmark(java.io.File, org.apache.lucene.benchmark.BenchmarkOptions)
 *
 *
 **/
public class StandardBenchmarker extends AbstractBenchmarker implements Benchmarker
{
    /** Sub-directory (under the working dir) expected to contain the source documents. */
    public static final String SOURCE_DIR = "reuters-out";
    /** Sub-directory (under the working dir) where the index is created. */
    public static final String INDEX_DIR = "index";
    //30-MAR-1987 14:22:36.87
    //NOTE(review): SimpleDateFormat is not thread-safe; this shared static instance
    //assumes single-threaded parsing - confirm before using from multiple threads.
    private static DateFormat format = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS");
    //DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.SHORT);
    static{
        format.setLenient(true);
    }

    public StandardBenchmarker()
    {
    }

    /**
     * Runs the whole benchmark: sets up the working/source/index directories,
     * builds the query set, then indexes and searches once per parameter permutation.
     *
     * @param workingDir directory under which source and index data are kept
     * @param opts must be a {@link StandardOptions} instance (unchecked cast below)
     * @return the parameter permutations that were run, with their collected run data
     */
    public TestData [] benchmark(File workingDir, BenchmarkOptions opts) throws Exception
    {
        StandardOptions options = (StandardOptions) opts;
        workingDir.mkdirs();
        File sourceDir = getSourceDirectory(workingDir);
        sourceDir.mkdirs();
        File indexDir = new File(workingDir, INDEX_DIR);
        indexDir.mkdirs();
        Analyzer a = new StandardAnalyzer();
        List queryList = new ArrayList(20);
        queryList.addAll(Arrays.asList(ReutersQueries.STANDARD_QUERIES));
        queryList.addAll(Arrays.asList(ReutersQueries.getPrebuiltQueries("body")));
        Query[] qs = createQueries(queryList, a);
        // Here you can limit the set of query benchmarks
        QueryData[] qds = QueryData.getAll(qs);
        // Here you can narrow down the set of test parameters
        TestData[] params = TestData.getTestDataMinMaxMergeAndMaxBuffered(new File[]{sourceDir/*, jumboDir*/}, new Analyzer[]{a});//TestData.getAll(new File[]{sourceDir, jumboDir}, new Analyzer[]{a});
        System.out.println("Testing " + params.length + " different permutations.");
        for (int i = 0; i < params.length; i++)
        {
            try
            {
                //start each permutation from an empty index
                reset(indexDir);
                params[i].setDirectory(FSDirectory.getDirectory(indexDir, true));
                params[i].setQueries(qds);
                System.out.println(params[i]);
                runBenchmark(params[i], options);
                // Here you can collect and output the runData for further processing.
                System.out.println(params[i].showRunData(params[i].getId()));
                //bench.runSearchBenchmark(queries, dir);
                params[i].getDirectory().close();
                //encourage memory reclamation between permutations so runs don't skew each other
                System.runFinalization();
                System.gc();
            }
            catch (Exception e)
            {
                //log and continue: one failing permutation should not abort the rest
                e.printStackTrace();
                System.out.println("EXCEPTION: " + e.getMessage());
                //break;
            }
        }
        return params;
    }

    /** Where the source documents live; subclasses may point at a different tree. */
    protected File getSourceDirectory(File workingDir)
    {
        return new File(workingDir, SOURCE_DIR);
    }

    /**
     * Run benchmark using supplied parameters.
     *
     * @param params benchmark parameters
     * @param options run count and logging/scale settings for this run
     * @throws Exception
     */
    protected void runBenchmark(TestData params, StandardOptions options) throws Exception
    {
        System.out.println("Start Time: " + new Date());
        int runCount = options.getRunCount();
        for (int i = 0; i < runCount; i++)
        {
            TestRunData trd = new TestRunData();
            trd.startRun();
            trd.setId(String.valueOf(i));
            //"true" re-creates the index, discarding the previous run's contents
            IndexWriter iw = new IndexWriter(params.getDirectory(), params.getAnalyzer(), true);
            iw.setMergeFactor(params.getMergeFactor());
            iw.setMaxBufferedDocs(params.getMaxBufferedDocs());
            iw.setUseCompoundFile(params.isCompound());
            makeIndex(trd, params.getSource(), iw, true, true, false, options);
            if (params.isOptimize())
            {
                TimeData td = new TimeData("optimize");
                trd.addData(td);
                td.start();
                iw.optimize();
                td.stop();
                trd.addData(td);
            }
            iw.close();
            QueryData[] queries = params.getQueries();
            if (queries != null)
            {
                IndexReader ir = null;
                IndexSearcher searcher = null;
                for (int k = 0; k < queries.length; k++)
                {
                    QueryData qd = queries[k];
                    //honor the per-query reopen flag by discarding the current reader
                    if (ir != null && qd.reopen)
                    {
                        searcher.close();
                        ir.close();
                        ir = null;
                        searcher = null;
                    }
                    if (ir == null)
                    {
                        ir = IndexReader.open(params.getDirectory());
                        searcher = new IndexSearcher(ir);
                    }
                    Document doc = null;
                    if (qd.warmup)
                    {
                        //warm the reader by touching every (non-deleted) document once
                        TimeData td = new TimeData(qd.id + "-warm");
                        for (int m = 0; m < ir.maxDoc(); m++)
                        {
                            td.start();
                            if (ir.isDeleted(m))
                            {
                                td.stop();
                                continue;
                            }
                            doc = ir.document(m);
                            td.stop();
                        }
                        trd.addData(td);
                    }
                    //time the search itself
                    TimeData td = new TimeData(qd.id + "-srch");
                    td.start();
                    Hits h = searcher.search(qd.q);
                    //System.out.println("Hits Size: " + h.length() + " Query: " + qd.q);
                    td.stop();
                    trd.addData(td);
                    //time traversal of the hit list, optionally fetching each document
                    td = new TimeData(qd.id + "-trav");
                    if (h != null && h.length() > 0)
                    {
                        for (int m = 0; m < h.length(); m++)
                        {
                            td.start();
                            int id = h.id(m);
                            if (qd.retrieve)
                            {
                                doc = ir.document(id);
                            }
                            td.stop();
                        }
                    }
                    trd.addData(td);
                }
                try
                {
                    if (searcher != null)
                    {
                        searcher.close();
                    }
                }
                catch (Exception e)
                {
                    //ignored: best-effort close at end of run
                }
                ;
                try
                {
                    if (ir != null)
                    {
                        ir.close();
                    }
                }
                catch (Exception e)
                {
                    //ignored: best-effort close at end of run
                }
                ;
            }
            trd.endRun();
            params.getRunData().add(trd);
            //System.out.println(params[i].showRunData(params[i].getId()));
            //params.showRunData(params.getId());
        }
        System.out.println("End Time: " + new Date());
    }

    /**
     * Parse the Reuters SGML and index:
     * Date, Title, Dateline, Body
     *
     * NOTE(review): despite the summary above, this reads an already-extracted
     * plain-text layout (line 1 = date, line 3 = title, rest = body) - confirm
     * against the output of the extraction step.
     *
     * @param in input file
     * @param tags optional labels stored as "tag0".."tagN" fields; may be null
     * @param stored whether field values are stored in the index
     * @param tokenized whether field values are analyzed
     * @param tfv whether term vectors are recorded
     * @return Lucene document
     */
    protected Document makeDocument(File in, String[] tags, boolean stored, boolean tokenized, boolean tfv)
            throws Exception
    {
        Document doc = new Document();
        // tag this document
        if (tags != null)
        {
            for (int i = 0; i < tags.length; i++)
            {
                doc.add(new Field("tag" + i, tags[i], stored == true ? Field.Store.YES : Field.Store.NO,
                        tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO));
            }
        }
        doc.add(new Field("file", in.getCanonicalPath(), stored == true ? Field.Store.YES : Field.Store.NO,
                tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO));
        BufferedReader reader = new BufferedReader(new FileReader(in));
        String line = null;
        //First line is the date, 3rd is the title, rest is body
        String dateStr = reader.readLine();
        reader.readLine();//skip an empty line
        String title = reader.readLine();
        reader.readLine();//skip an empty line
        StringBuffer body = new StringBuffer(1024);
        while ((line = reader.readLine()) != null)
        {
            body.append(line).append(' ');
        }
        //NOTE(review): dateStr is null for an empty file, which would NPE here;
        //also the reader is never explicitly closed.
        Date date = format.parse(dateStr.trim());
        doc.add(new Field("date", DateTools.dateToString(date, DateTools.Resolution.SECOND), Field.Store.YES, Field.Index.UN_TOKENIZED));
        if (title != null)
        {
            doc.add(new Field("title", title, stored == true ? Field.Store.YES : Field.Store.NO,
                    tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO));
        }
        if (body.length() > 0)
        {
            doc.add(new Field("body", body.toString(), stored == true ? Field.Store.YES : Field.Store.NO,
                    tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO));
        }
        return doc;
    }

    /**
     * Make index, and collect time data.
     *
     * @param trd run data to populate
     * @param srcDir directory with source files
     * @param iw index writer, already open
     * @param stored store values of fields
     * @param tokenized tokenize fields
     * @param tfv store term vectors
     * @param options supplies scaleUp, logStep and the per-pass document cap
     * @throws Exception
     */
    protected void makeIndex(TestRunData trd, File srcDir, IndexWriter iw, boolean stored, boolean tokenized,
                             boolean tfv, StandardOptions options) throws Exception
    {
        //File[] groups = srcDir.listFiles();
        List files = new ArrayList();
        getAllFiles(srcDir, null, files);
        Document doc = null;
        long cnt = 0L;
        TimeData td = new TimeData();
        td.name = "addDocument";
        int scaleUp = options.getScaleUp();
        int logStep = options.getLogStep();
        //note: the cap applies per scale-up pass, not to the grand total
        int max = Math.min(files.size(), options.getMaximumDocumentsToIndex());
        for (int s = 0; s < scaleUp; s++)
        {
            //tag each pass distinctly so repeated documents remain distinguishable
            String[] tags = new String[]{srcDir.getName() + "/" + s};
            int i = 0;
            for (Iterator iterator = files.iterator(); iterator.hasNext() && i < max; i++)
            {
                File file = (File) iterator.next();
                doc = makeDocument(file, tags, stored, tokenized, tfv);
                td.start();
                iw.addDocument(doc);
                td.stop();
                cnt++;
                if (cnt % logStep == 0)
                {
                    System.err.println(" - processed " + cnt + ", run id=" + trd.getId());
                    trd.addData(td);
                    td.reset();
                }
            }
        }
        trd.addData(td);
    }

    /**
     * Recursively collect all plain files under srcDir into allFiles.
     *
     * @param srcDir root of the tree to scan
     * @param filter optional filter passed to listFiles; may be null
     * @param allFiles output list the matching files are appended to
     */
    public static void getAllFiles(File srcDir, FileFilter filter, List allFiles)
    {
        //NOTE(review): listFiles returns null if srcDir is unreadable or not a
        //directory, which would NPE below - assumed not to happen for benchmark data.
        File [] files = srcDir.listFiles(filter);
        for (int i = 0; i < files.length; i++)
        {
            File file = files[i];
            if (file.isDirectory())
            {
                getAllFiles(file, filter, allFiles);
            }
            else
            {
                allFiles.add(file);
            }
        }
    }

    /**
     * Parse the strings containing Lucene queries.
     *
     * @param qs list of query expression Strings and/or prebuilt Query objects
     * @param a analyzer to use when parsing queries
     * @return array of Lucene queries
     */
    public static Query[] createQueries(List qs, Analyzer a)
    {
        QueryParser qp = new QueryParser("body", a);
        List queries = new ArrayList();
        for (int i = 0; i < qs.size(); i++)
        {
            try
            {
                Object query = qs.get(i);
                Query q = null;
                if (query instanceof String)
                {
                    q = qp.parse((String) query);
                }
                else if (query instanceof Query)
                {
                    //already-built queries are passed through untouched
                    q = (Query) query;
                }
                else
                {
                    System.err.println("Unsupported Query Type: " + query);
                }
                if (q != null)
                {
                    queries.add(q);
                }
            }
            catch (Exception e)
            {
                //skip unparsable queries but keep the rest
                e.printStackTrace();
            }
        }
        return (Query[]) queries.toArray(new Query[0]);
    }

    /**
     * Remove existing index.
     *
     * @param indexDir index directory to delete and re-create empty
     * @throws Exception
     */
    protected void reset(File indexDir) throws Exception
    {
        if (indexDir.exists())
        {
            fullyDelete(indexDir);
        }
        indexDir.mkdirs();
    }

    /**
     * Save a stream to a file.
     *
     * @param is input stream
     * @param out output file
     * @param closeInput if true, close the input stream when done.
     * @throws Exception
     */
    protected void saveStream(InputStream is, File out, boolean closeInput) throws Exception
    {
        byte[] buf = new byte[4096];
        FileOutputStream fos = new FileOutputStream(out);
        int len = 0;
        long total = 0L;
        long time = System.currentTimeMillis();
        long delta = time;
        while ((len = is.read(buf)) > 0)
        {
            fos.write(buf, 0, len);
            total += len;
            time = System.currentTimeMillis();
            //report progress at most every five seconds
            if (time - delta > 5000)
            {
                System.err.println(" - copied " + total / 1024 + " kB...");
                delta = time;
            }
        }
        fos.flush();
        fos.close();
        //NOTE(review): streams are not closed in a finally block, so an I/O
        //error leaks fos (and is, when closeInput is set).
        if (closeInput)
        {
            is.close();
        }
    }
}

View File

@ -0,0 +1,86 @@
package org.apache.lucene.benchmark.standard;
import java.io.File;
import org.apache.lucene.benchmark.Constants;
import org.apache.lucene.benchmark.BenchmarkOptions;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Bean holding the tunable parameters for {@link StandardBenchmarker} runs.
 * Every value starts at the shared default from {@link Constants}.
 **/
public class StandardOptions implements BenchmarkOptions
{
    private int runCount = Constants.DEFAULT_RUN_COUNT;
    private int logStep = Constants.DEFAULT_LOG_STEP;
    private int scaleUp = Constants.DEFAULT_SCALE_UP;
    private int maximumDocumentsToIndex = Constants.DEFAULT_MAXIMUM_DOCUMENTS;

    /**
     * The number of times to run the benchmark
     * @return the run count
     */
    public int getRunCount()
    {
        return runCount;
    }

    public void setRunCount(int runCount)
    {
        this.runCount = runCount;
    }

    /**
     * How often to print out log messages when in benchmark loops
     * @return the log interval, in documents
     */
    public int getLogStep()
    {
        return logStep;
    }

    public void setLogStep(int logStep)
    {
        this.logStep = logStep;
    }

    /**
     * How many times the same document collection is re-added to the index
     * @return the scale-up factor
     */
    public int getScaleUp()
    {
        return scaleUp;
    }

    public void setScaleUp(int scaleUp)
    {
        this.scaleUp = scaleUp;
    }

    /**
     * Upper bound on the number of documents indexed per scale-up pass
     * @return the maximum document count
     */
    public int getMaximumDocumentsToIndex()
    {
        return maximumDocumentsToIndex;
    }

    public void setMaximumDocumentsToIndex(int maximumDocumentsToIndex)
    {
        this.maximumDocumentsToIndex = maximumDocumentsToIndex;
    }
}

View File

@ -0,0 +1,43 @@
package org.apache.lucene.benchmark.stats;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Holds min/avg/max snapshots of free and total JVM memory observed
 * during a benchmark run.
 *
 * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
 */
public class MemUsage {
    public long maxFree, minFree, avgFree;
    public long maxTotal, minTotal, avgTotal;

    /** Render the raw values in bytes. */
    public String toString() {
        return toScaledString(1, "B");
    }

    /** Scale down the values by divisor, append the unit string. */
    public String toScaledString(int div, String unit) {
        StringBuffer out = new StringBuffer();
        out.append("free=").append(minFree / div)
           .append("/").append(avgFree / div)
           .append("/").append(maxFree / div).append(" ").append(unit);
        out.append(", total=").append(minTotal / div)
           .append("/").append(avgTotal / div)
           .append("/").append(maxTotal / div).append(" ").append(unit);
        return out.toString();
    }
}

View File

@ -0,0 +1,79 @@
package org.apache.lucene.benchmark.stats;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Vector;
import org.apache.lucene.search.Query;
import org.apache.lucene.benchmark.Constants;
/**
 * This class holds parameters for a query benchmark.
 *
 * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
 */
public class QueryData {
    /** Benchmark id */
    public String id;
    /** Lucene query */
    public Query q;
    /** If true, re-open index reader before benchmark. */
    public boolean reopen;
    /** If true, warm-up the index reader before searching by sequentially
     * retrieving all documents from index.
     */
    public boolean warmup;
    /**
     * If true, actually retrieve documents returned in Hits.
     */
    public boolean retrieve;

    /**
     * Prepare a list of benchmark data, using all possible combinations of
     * benchmark parameters.
     *
     * @param queries source Lucene queries
     * @return one QueryData per (query, reopen, warmup, retrieve) combination
     */
    public static QueryData[] getAll(Query[] queries) {
        Vector combos = new Vector();
        for (int qi = 0; qi < queries.length; qi++) {
            for (int re = 1; re >= 0; re--) {
                for (int wu = 1; wu >= 0; wu--) {
                    for (int rt = 0; rt < 2; rt++) {
                        QueryData qd = new QueryData();
                        qd.id = "qd-" + qi + re + wu + rt;
                        qd.reopen = Constants.BOOLEANS[re].booleanValue();
                        qd.warmup = Constants.BOOLEANS[wu].booleanValue();
                        qd.retrieve = Constants.BOOLEANS[rt].booleanValue();
                        qd.q = queries[qi];
                        combos.add(qd);
                    }
                }
            }
        }
        return (QueryData[]) combos.toArray(new QueryData[0]);
    }

    /** Short legend for interpreting toString() output. */
    public static String getLabels() {
        return "# Query data: R-reopen, W-warmup, T-retrieve, N-no";
    }

    public String toString() {
        StringBuffer sb = new StringBuffer(id);
        sb.append(" ").append(reopen ? "R" : "NR");
        sb.append(" ").append(warmup ? "W" : "NW");
        sb.append(" ").append(retrieve ? "T" : "NT");
        sb.append(" [").append(q.toString()).append("]");
        return sb.toString();
    }
}

View File

@ -0,0 +1,576 @@
package org.apache.lucene.benchmark.stats;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Vector;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.Constants;
import org.apache.lucene.store.Directory;
/**
* This class holds together all parameters related to a test. Single test is
* performed several times, and all results are averaged.
*
* @author Andrzej Bialecki &lt;ab@getopt.org&gt;
*/
public class TestData
{
    /** maxBufferedDocs values tried by {@link #getAll(File[], Analyzer[])}. */
    public static int[] MAX_BUFFERED_DOCS_COUNTS = new int[]{10, 20, 50, 100, 200, 500};
    /** mergeFactor values tried by {@link #getAll(File[], Analyzer[])}. */
    public static int[] MERGEFACTOR_COUNTS = new int[]{10, 20, 50, 100, 200, 500};
    /**
     * ID of this test data.
     */
    private String id;
    /**
     * Heap size.
     */
    private long heap;
    /**
     * List of results for each test run with these parameters.
     */
    private Vector runData = new Vector();
    private int maxBufferedDocs, mergeFactor;
    /**
     * Directory containing source files.
     */
    private File source;
    /**
     * Lucene Directory implementation for creating an index.
     */
    private Directory directory;
    /**
     * Analyzer to use when adding documents.
     */
    private Analyzer analyzer;
    /**
     * If true, use compound file format.
     */
    private boolean compound;
    /**
     * If true, optimize index when finished adding documents.
     */
    private boolean optimize;
    /**
     * Data for search benchmarks.
     */
    private QueryData[] queries;

    public TestData()
    {
        // Snapshot the configured heap limit so it can be reported with the results.
        heap = Runtime.getRuntime().maxMemory();
    }

    /** Accumulator for averaged (double) rate figures. */
    private static class DCounter
    {
        double total;
        int count, recordCount;
    }

    /** Accumulator for averaged (long) memory figures. */
    private static class LCounter
    {
        long total;
        int count;
    }

    /** Combined speed (D*) and memory (L*) accumulators, used for per-task totals. */
    private static class LDCounter
    {
        double Dtotal;
        int Dcount, DrecordCount;
        long Ltotal0;
        int Lcount0;
        long Ltotal1;
        int Lcount1;
    }

    // Column headers; their widths also drive the padding in format(...).
    static final String ID = "# testData id     ";
    static final String OP = "operation      ";
    static final String RUNCNT = " runCnt";
    static final String RECCNT = " recCnt";
    static final String RECSEC = "  rec/s";
    static final String FREEMEM = "  avgFreeMem";
    static final String TOTMEM = "  avgTotalMem";
    static final String COLS[] = {
            ID,
            OP,
            RUNCNT,
            RECCNT,
            RECSEC,
            FREEMEM,
            TOTMEM
    };

    /**
     * Get a textual summary of the benchmark results, averaged over all test runs.
     *
     * @param prefix label printed in the ID column of every per-label row
     * @return formatted report; a marker string when no runs were recorded
     */
    public String showRunData(String prefix)
    {
        if (runData.size() == 0)
        {
            return "# [NO RUN DATA]";
        }
        HashMap resByTask = new HashMap();
        StringBuffer sb = new StringBuffer();
        String lineSep = System.getProperty("line.separator");
        sb.append("warm = Warm Index Reader").append(lineSep).append("srch = Search Index").append(lineSep).append("trav = Traverse Hits list, optionally retrieving document").append(lineSep).append(lineSep);
        for (int i = 0; i < COLS.length; i++) {
            sb.append(COLS[i]);
        }
        sb.append("\n");
        // First pass: aggregate per-label memory (mapMem) and speed (mapSpeed)
        // figures across all recorded runs.
        LinkedHashMap mapMem = new LinkedHashMap();
        LinkedHashMap mapSpeed = new LinkedHashMap();
        for (int i = 0; i < runData.size(); i++)
        {
            TestRunData trd = (TestRunData) runData.get(i);
            Collection labels = trd.getLabels();
            Iterator it = labels.iterator();
            while (it.hasNext())
            {
                String label = (String) it.next();
                MemUsage mem = trd.getMemUsage(label);
                if (mem != null)
                {
                    // tm[0] accumulates average free memory, tm[1] average total memory.
                    TestData.LCounter[] tm = (TestData.LCounter[]) mapMem.get(label);
                    if (tm == null)
                    {
                        tm = new TestData.LCounter[2];
                        tm[0] = new TestData.LCounter();
                        tm[1] = new TestData.LCounter();
                        mapMem.put(label, tm);
                    }
                    tm[0].total += mem.avgFree;
                    tm[0].count++;
                    tm[1].total += mem.avgTotal;
                    tm[1].count++;
                }
                TimeData td = trd.getTotals(label);
                if (td != null)
                {
                    TestData.DCounter dc = (TestData.DCounter) mapSpeed.get(label);
                    if (dc == null)
                    {
                        dc = new TestData.DCounter();
                        mapSpeed.put(label, dc);
                    }
                    dc.count++;
                    //dc.total += td.getRate();
                    dc.total += (td.count>0 && td.elapsed<=0 ? 1 : td.elapsed); // assume atleast 1ms for any countable op
                    dc.recordCount += td.count;
                }
            }
        }
        // Second pass: format per-label speed rows and roll them up by task
        // (the suffix after the last '-' in the label).
        LinkedHashMap res = new LinkedHashMap();
        Iterator it = mapSpeed.keySet().iterator();
        while (it.hasNext())
        {
            String label = (String) it.next();
            TestData.DCounter dc = (TestData.DCounter) mapSpeed.get(label);
            res.put(label,
                    format(dc.count, RUNCNT) +
                    format(dc.recordCount / dc.count, RECCNT) +
                    format(1,(float) (dc.recordCount * 1000.0 / (dc.total>0 ? dc.total : 1.0)), RECSEC)
                    //format((float) (dc.total / (double) dc.count), RECSEC)
                    );

            // also sum by task
            String task = label.substring(label.lastIndexOf("-")+1);
            LDCounter ldc = (LDCounter) resByTask.get(task);
            if (ldc==null) {
                ldc = new LDCounter();
                resByTask.put(task,ldc);
            }
            ldc.Dcount += dc.count;
            ldc.DrecordCount += dc.recordCount;
            ldc.Dtotal += (dc.count>0 && dc.total<=0 ? 1 : dc.total); // assume atleast 1ms for any countable op
        }
        // Third pass: append memory columns to each row (creating a speed stub
        // for labels that only have memory data) and roll memory up by task.
        it = mapMem.keySet().iterator();
        while (it.hasNext())
        {
            String label = (String) it.next();
            TestData.LCounter[] lc = (TestData.LCounter[]) mapMem.get(label);
            String speed = (String) res.get(label);
            boolean makeSpeed = false;
            if (speed == null)
            {
                makeSpeed = true;
                speed =
                    format(lc[0].count, RUNCNT) +
                    format(0, RECCNT) +
                    format(0,(float)0.0, RECSEC);
            }
            res.put(label, speed +
                    format(0, lc[0].total / lc[0].count, FREEMEM) +
                    format(0, lc[1].total / lc[1].count, TOTMEM));

            // also sum by task
            String task = label.substring(label.lastIndexOf("-")+1);
            LDCounter ldc = (LDCounter) resByTask.get(task);
            if (ldc==null) {
                ldc = new LDCounter();
                resByTask.put(task,ldc);
                makeSpeed = true;
            }
            if (makeSpeed) {
                ldc.Dcount += lc[0].count;
            }
            ldc.Lcount0 += lc[0].count;
            ldc.Lcount1 += lc[1].count;
            ldc.Ltotal0 += lc[0].total;
            ldc.Ltotal1 += lc[1].total;
        }
        // Emit the per-label rows.
        it = res.keySet().iterator();
        while (it.hasNext())
        {
            String label = (String) it.next();
            sb.append(format(prefix, ID));
            sb.append(format(label, OP));
            sb.append(res.get(label)).append("\n");
        }
        // show results by task (srch, optimize, etc.)
        sb.append("\n");
        for (int i = 0; i < COLS.length; i++) {
            sb.append(COLS[i]);
        }
        sb.append("\n");
        it = resByTask.keySet().iterator();
        while (it.hasNext())
        {
            String task = (String) it.next();
            LDCounter ldc = (LDCounter) resByTask.get(task);
            sb.append(format("    ", ID));
            sb.append(format(task, OP));
            sb.append(format(ldc.Dcount, RUNCNT));
            // Guard the averages: a task could in principle have only one kind
            // of data, leaving the other counter at zero.
            sb.append(format(ldc.Dcount == 0 ? 0 : ldc.DrecordCount / ldc.Dcount, RECCNT));
            sb.append(format(1,(float) (ldc.DrecordCount * 1000.0 / (ldc.Dtotal>0 ? ldc.Dtotal : 1.0)), RECSEC));
            sb.append(format(0, ldc.Lcount0 == 0 ? 0 : ldc.Ltotal0 / ldc.Lcount0, FREEMEM));
            sb.append(format(0, ldc.Lcount1 == 0 ? 0 : ldc.Ltotal1 / ldc.Lcount1, TOTMEM));
            sb.append("\n");
        }
        return sb.toString();
    }

    // Shared formatters: [0] renders 0 fraction digits, [1] renders 1.
    private static NumberFormat numFormat [] = { NumberFormat.getInstance(), NumberFormat.getInstance()};
    private static final String padd = "                                  ";
    static {
        numFormat[0].setMaximumFractionDigits(0);
        numFormat[0].setMinimumFractionDigits(0);
        numFormat[1].setMaximumFractionDigits(1);
        numFormat[1].setMinimumFractionDigits(1);
    }

    // padd number from left
    // numFracDigits must be 0 or 1.
    static String format(int numFracDigits, float f, String col) {
        String res = padd + numFormat[numFracDigits].format(f);
        return res.substring(res.length() - col.length());
    }

    // padd number from left
    static String format(int n, String col) {
        String res = padd + n;
        return res.substring(res.length() - col.length());
    }

    // padd string from right
    static String format(String s, String col) {
        return (s + padd).substring(0,col.length());
    }

    /**
     * Prepare a list of benchmark data, using all possible combinations of
     * benchmark parameters.
     *
     * @param sources   list of directories containing different source document
     *                  collections
     * @param analyzers of analyzers to use.
     */
    public static TestData[] getAll(File[] sources, Analyzer[] analyzers)
    {
        List res = new ArrayList(50);
        TestData ref = new TestData();
        for (int q = 0; q < analyzers.length; q++)
        {
            for (int m = 0; m < sources.length; m++)
            {
                for (int i = 0; i < MAX_BUFFERED_DOCS_COUNTS.length; i++)
                {
                    for (int k = 0; k < MERGEFACTOR_COUNTS.length; k++)
                    {
                        for (int n = 0; n < Constants.BOOLEANS.length; n++)
                        {
                            for (int p = 0; p < Constants.BOOLEANS.length; p++)
                            {
                                // ref is reused as a template; clone() takes the snapshot.
                                ref.id = "td-" + q + m + i + k + n + p;
                                ref.source = sources[m];
                                ref.analyzer = analyzers[q];
                                ref.maxBufferedDocs = MAX_BUFFERED_DOCS_COUNTS[i];
                                ref.mergeFactor = MERGEFACTOR_COUNTS[k];
                                ref.compound = Constants.BOOLEANS[n].booleanValue();
                                ref.optimize = Constants.BOOLEANS[p].booleanValue();
                                try
                                {
                                    res.add(ref.clone());
                                }
                                catch (Exception e)
                                {
                                    e.printStackTrace();
                                }
                            }
                        }
                    }
                }
            }
        }
        return (TestData[]) res.toArray(new TestData[0]);
    }

    /**
     * Similar to {@link #getAll(java.io.File[], org.apache.lucene.analysis.Analyzer[])} but only uses
     * maxBufferedDocs of 10 and 100 and same for mergeFactor, thus reducing the number of permutations significantly.
     * It also only uses compound file and optimize is always true.
     *
     * @param sources
     * @param analyzers
     * @return An Array of {@link TestData}
     */
    public static TestData[] getTestDataMinMaxMergeAndMaxBuffered(File[] sources, Analyzer[] analyzers)
    {
        List res = new ArrayList(50);
        TestData ref = new TestData();
        for (int q = 0; q < analyzers.length; q++)
        {
            for (int m = 0; m < sources.length; m++)
            {
                addCombination(res, ref, sources[m], analyzers[q], q, m, 10, 10);
                addCombination(res, ref, sources[m], analyzers[q], q, m, 10, 100);
                addCombination(res, ref, sources[m], analyzers[q], q, m, 100, 10);
                addCombination(res, ref, sources[m], analyzers[q], q, m, 100, 100);
            }
        }
        return (TestData[]) res.toArray(new TestData[0]);
    }

    /**
     * Configure <code>ref</code> with one maxBufferedDocs/mergeFactor combination
     * (compound format on, optimize on) and add a clone of it to <code>res</code>.
     */
    private static void addCombination(List res, TestData ref, File source, Analyzer analyzer,
                                       int analyzerIndex, int sourceIndex, int maxBuffered, int mergeFactor)
    {
        ref.id = "td-" + analyzerIndex + sourceIndex + "_" + maxBuffered + "_" + mergeFactor;
        ref.source = source;
        ref.analyzer = analyzer;
        ref.maxBufferedDocs = maxBuffered;
        ref.mergeFactor = mergeFactor;
        ref.compound = true;
        ref.optimize = true;
        try
        {
            res.add(ref.clone());
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Shallow copy of the configuration fields.
     * Note: runData is deliberately not cloned; queries is not copied either —
     * presumably because queries are assigned after cloning (TODO confirm).
     */
    protected Object clone()
    {
        TestData cl = new TestData();
        cl.id = id;
        cl.compound = compound;
        cl.heap = heap;
        cl.mergeFactor = mergeFactor;
        cl.maxBufferedDocs = maxBufferedDocs;
        cl.optimize = optimize;
        cl.source = source;
        cl.directory = directory;
        cl.analyzer = analyzer;
        // don't clone runData
        return cl;
    }

    /** Multi-line, comment-style ("#"-prefixed) description of this configuration. */
    public String toString()
    {
        StringBuffer res = new StringBuffer();
        res.append("#-- ID: ").append(id).append(", ").append(new Date()).append(", heap=").append(heap).append(" --\n");
        res.append("# source=").append(source).append(", directory=").append(directory).append("\n");
        res.append("# maxBufferedDocs=").append(maxBufferedDocs).append(", mergeFactor=").append(mergeFactor);
        res.append(", compound=").append(compound).append(", optimize=").append(optimize).append("\n");
        if (queries != null)
        {
            res.append(QueryData.getLabels()).append("\n");
            for (int i = 0; i < queries.length; i++)
            {
                res.append("# ").append(queries[i].toString()).append("\n");
            }
        }
        return res.toString();
    }

    public Analyzer getAnalyzer()
    {
        return analyzer;
    }

    public void setAnalyzer(Analyzer analyzer)
    {
        this.analyzer = analyzer;
    }

    public boolean isCompound()
    {
        return compound;
    }

    public void setCompound(boolean compound)
    {
        this.compound = compound;
    }

    public Directory getDirectory()
    {
        return directory;
    }

    public void setDirectory(Directory directory)
    {
        this.directory = directory;
    }

    public long getHeap()
    {
        return heap;
    }

    public void setHeap(long heap)
    {
        this.heap = heap;
    }

    public String getId()
    {
        return id;
    }

    public void setId(String id)
    {
        this.id = id;
    }

    public int getMaxBufferedDocs()
    {
        return maxBufferedDocs;
    }

    public void setMaxBufferedDocs(int maxBufferedDocs)
    {
        this.maxBufferedDocs = maxBufferedDocs;
    }

    public int getMergeFactor()
    {
        return mergeFactor;
    }

    public void setMergeFactor(int mergeFactor)
    {
        this.mergeFactor = mergeFactor;
    }

    public boolean isOptimize()
    {
        return optimize;
    }

    public void setOptimize(boolean optimize)
    {
        this.optimize = optimize;
    }

    public QueryData[] getQueries()
    {
        return queries;
    }

    public void setQueries(QueryData[] queries)
    {
        this.queries = queries;
    }

    public Vector getRunData()
    {
        return runData;
    }

    public void setRunData(Vector runData)
    {
        this.runData = runData;
    }

    public File getSource()
    {
        return source;
    }

    public void setSource(File source)
    {
        this.source = source;
    }
}

View File

@ -0,0 +1,175 @@
package org.apache.lucene.benchmark.stats;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.LinkedHashMap;
import java.util.Vector;
import java.util.Collection;
import java.util.Iterator;
/**
* This class holds series of TimeData related to a single test run. TimeData
* values may contribute to different measurements, so this class provides also
* some useful methods to separate them.
*
* @author Andrzej Bialecki &lt;ab@getopt.org&gt;
*/
public class TestRunData {

    /** Identifier of this run (usually shared with the owning TestData). */
    private String id;

    /** Wall-clock start/end timestamps in milliseconds; 0 until marked. */
    private long start = 0L, end = 0L;

    /** Maps data-point name -&gt; Vector of TimeData samples, in insertion order. */
    private LinkedHashMap data = new LinkedHashMap();

    public TestRunData() {}

    public TestRunData(String id) {
        this.id = id;
    }

    public LinkedHashMap getData()
    {
        return data;
    }

    public String getId()
    {
        return id;
    }

    public void setId(String id)
    {
        this.id = id;
    }

    public long getEnd()
    {
        return end;
    }

    public long getStart()
    {
        return start;
    }

    /** Mark the starting time of this test run. */
    public void startRun() {
        start = System.currentTimeMillis();
    }

    /** Mark the ending time of this test run. */
    public void endRun() {
        end = System.currentTimeMillis();
    }

    /** Add a data point, recording current memory usage on it first. */
    public void addData(TimeData td) {
        td.recordMemUsage();
        Vector series = (Vector) data.get(td.name);
        if (series == null) {
            series = new Vector();
            data.put(td.name, series);
        }
        // Store a snapshot so later mutation of td does not affect history.
        series.add(td.clone());
    }

    /** Get a list of all available types of data points. */
    public Collection getLabels() {
        return data.keySet();
    }

    /** Get total values from all data points of a given type. */
    public TimeData getTotals(String label) {
        Vector series = (Vector) data.get(label);
        if (series == null)
        {
            return null;
        }
        TimeData sum = new TimeData("TOTAL " + label);
        for (int k = 0; k < series.size(); k++) {
            TimeData sample = (TimeData) series.get(k);
            sum.count += sample.count;
            sum.elapsed += sample.elapsed;
        }
        return sum;
    }

    /** Get total values from all data points of all types.
     * @return a list of TimeData values for all types.
     */
    public Vector getTotals() {
        Vector totals = new Vector();
        for (Iterator iter = getLabels().iterator(); iter.hasNext();) {
            totals.add(getTotals((String) iter.next()));
        }
        return totals;
    }

    /** Get memory usage stats (min/max/avg free and total) for a given data type. */
    public MemUsage getMemUsage(String label) {
        Vector series = (Vector) data.get(label);
        if (series == null)
        {
            return null;
        }
        MemUsage usage = new MemUsage();
        usage.minFree = Long.MAX_VALUE;
        usage.minTotal = Long.MAX_VALUE;
        long freeSum = 0L, totalSum = 0L;
        for (int k = 0; k < series.size(); k++) {
            TimeData sample = (TimeData) series.get(k);
            usage.maxFree = Math.max(usage.maxFree, sample.freeMem);
            usage.maxTotal = Math.max(usage.maxTotal, sample.totalMem);
            usage.minFree = Math.min(usage.minFree, sample.freeMem);
            usage.minTotal = Math.min(usage.minTotal, sample.totalMem);
            freeSum += sample.freeMem;
            totalSum += sample.totalMem;
        }
        // series is non-empty whenever it exists (addData always adds a sample).
        usage.avgFree = freeSum / series.size();
        usage.avgTotal = totalSum / series.size();
        return usage;
    }

    /** Return a string representation: one line per label with totals and memory. */
    public String toString() {
        StringBuffer buf = new StringBuffer();
        for (Iterator iter = getLabels().iterator(); iter.hasNext();) {
            String label = (String) iter.next();
            buf.append(id + "-" + label + " " + getTotals(label).toString(false) + " ");
            buf.append(getMemUsage(label).toScaledString(1024 * 1024, "MB") + "\n");
        }
        return buf.toString();
    }
}

View File

@ -0,0 +1,102 @@
package org.apache.lucene.benchmark.stats;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This class holds a data point measuring speed of processing.
*
* @author Andrzej Bialecki &lt;ab@getopt.org&gt;
*/
public class TimeData {
  /** Name of the data point - usually one of a data series with the same name */
  public String name;
  /** Number of records processed. */
  public long count = 0;
  /** Elapsed time in milliseconds. */
  public long elapsed = 0L;
  /** Timestamp captured by start(); stop() measures against it. */
  private long delta = 0L;
  /** Free memory at the end of measurement interval. */
  public long freeMem = 0L;
  /** Total memory at the end of measurement interval. */
  public long totalMem = 0L;

  public TimeData() {}

  public TimeData(String name) {
    this.name = name;
  }

  /** Start counting elapsed time. */
  public void start() {
    delta = System.currentTimeMillis();
  }

  /** Stop counting elapsed time; bumps the record count by one. */
  public void stop() {
    count++;
    elapsed += System.currentTimeMillis() - delta;
  }

  /** Record memory usage. */
  public void recordMemUsage() {
    Runtime rt = Runtime.getRuntime();
    freeMem = rt.freeMemory();
    totalMem = rt.totalMemory();
  }

  /** Reset counters. */
  public void reset() {
    count = 0;
    elapsed = 0L;
    delta = 0L;
  }

  /** Field-by-field copy (memory figures included). */
  protected Object clone() {
    TimeData copy = new TimeData(name);
    copy.elapsed = elapsed;
    copy.count = count;
    copy.delta = delta;
    copy.freeMem = freeMem;
    copy.totalMem = totalMem;
    return copy;
  }

  /** Get rate of processing, defined as number of processed records per second. */
  public double getRate() {
    // assume atleast 1ms for any countable op
    long divisor = elapsed > 0 ? elapsed : 1;
    return (double) count * 1000.0 / (double) divisor;
  }

  /** Get a short legend for toString() output. */
  public static String getLabels() {
    return "# count\telapsed\trec/s\tfreeMem\ttotalMem";
  }

  public String toString() { return toString(true); }

  /**
   * Return a tab-seprated string containing this data.
   * @param withMem if true, append also memory information
   * @return The String
   */
  public String toString(boolean withMem) {
    StringBuffer buf = new StringBuffer();
    buf.append(count).append('\t').append(elapsed).append('\t').append(getRate());
    if (withMem) {
      buf.append('\t').append(freeMem).append('\t').append(totalMem);
    }
    return buf.toString();
  }
}

View File

@ -0,0 +1,175 @@
package org.apache.lucene.benchmark.utils;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
*/
public class ExtractReuters
{
private File reutersDir;
private File outputDir;
private static final String LINE_SEPARATOR = System.getProperty("line.separator");
public ExtractReuters(File reutersDir, File outputDir)
{
this.reutersDir = reutersDir;
this.outputDir = outputDir;
System.out.println("Deleting all files in " + outputDir);
File [] files = outputDir.listFiles();
for (int i = 0; i < files.length; i++)
{
files[i].delete();
}
}
public void extract()
{
File [] sgmFiles = reutersDir.listFiles(new FileFilter()
{
public boolean accept(File file)
{
return file.getName().endsWith(".sgm");
}
});
if (sgmFiles != null && sgmFiles.length > 0)
{
for (int i = 0; i < sgmFiles.length; i++)
{
File sgmFile = sgmFiles[i];
extractFile(sgmFile);
}
}
else
{
System.err.println("No .sgm files in " + reutersDir);
}
}
Pattern EXTRACTION_PATTERN = Pattern.compile("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>");
private static String[] META_CHARS
= {"&", "<", ">", "\"", "'"};
private static String[] META_CHARS_SERIALIZATIONS
= {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
/**
* Override if you wish to change what is extracted
*
* @param sgmFile
*/
protected void extractFile(File sgmFile)
{
try
{
BufferedReader reader = new BufferedReader(new FileReader(sgmFile));
StringBuffer buffer = new StringBuffer(1024);
StringBuffer outBuffer = new StringBuffer(1024);
String line = null;
int index = -1;
int docNumber = 0;
while ((line = reader.readLine()) != null)
{
//when we see a closing reuters tag, flush the file
if ((index = line.indexOf("</REUTERS")) == -1)
{
//Replace the SGM escape sequences
buffer.append(line).append(' ');//accumulate the strings for now, then apply regular expression to get the pieces,
}
else
{
//Extract the relevant pieces and write to a file in the output dir
Matcher matcher = EXTRACTION_PATTERN.matcher(buffer);
while (matcher.find())
{
for (int i = 1; i <= matcher.groupCount(); i++)
{
if (matcher.group(i) != null)
{
outBuffer.append(matcher.group(i));
}
}
outBuffer.append(LINE_SEPARATOR).append(LINE_SEPARATOR);
}
String out = outBuffer.toString();
for (int i = 0; i < META_CHARS_SERIALIZATIONS.length; i++)
{
out = out.replaceAll(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
}
File outFile = new File(outputDir, sgmFile.getName() + "-" + (docNumber++) + ".txt");
//System.out.println("Writing " + outFile);
FileWriter writer = new FileWriter(outFile);
writer.write(out);
writer.close();
outBuffer.setLength(0);
buffer.setLength(0);
}
}
}
catch (
IOException e
)
{
throw new RuntimeException(e);
}
}
public static void main(String[] args)
{
if (args.length != 2)
{
printUsage();
}
File reutersDir = new File(args[0]);
if (reutersDir.exists())
{
File outputDir = new File(args[1]);
outputDir.mkdirs();
ExtractReuters extractor = new ExtractReuters(reutersDir, outputDir);
extractor.extract();
}
else
{
printUsage();
}
}
private static void printUsage()
{
System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
}
}

View File

@ -0,0 +1,44 @@
<HTML>
<!--**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
-->
<HEAD>
<TITLE>Lucene Benchmarking Package</TITLE>
</HEAD>
<BODY>
<DIV>
<p/>
The benchmark contribution contains tools for benchmarking Lucene using a standard, freely available corpus. ANT will
download the corpus automatically, place it in a temp directory and then unpack it to the working.dir directory specified in the build.
The temp directory
and working directory can be safely removed after a run. However, the next time the task is run, it will need to download the files again.
<p/>
Classes implementing the Benchmarker interface should have a no-argument constructor if they are to be used with the Driver class. The Driver
class is provided for convenience only. Feel free to implement your own main class for your benchmarker.
<p/>
The StandardBenchmarker is meant to be just that, a standard that runs out of the box with no configuration or changes needed.
Other benchmarking classes may derive from it to provide alternate views or to take in command line options. When reporting benchmarking runs
you should state any alterations you have made.
<p/>
To run the short version of the StandardBenchmarker, call "ant run-micro-standard". This should take a minute or so to complete and give you a preliminary idea of how your change affects the code.
<p/>
To run the long version of the StandardBenchmarker, call "ant run-standard". This takes considerably longer.
<p/>
The original code for these classes was donated by Andrzej Bialecki at http://issues.apache.org/jira/browse/LUCENE-675 and has been updated by Grant Ingersoll to make some parts of the code reusable in other benchmarkers.
</DIV>
<DIV>&nbsp;</DIV>
</BODY>
</HTML>