diff --git a/contrib/benchmark/conf/micro-standard-config.xml b/contrib/benchmark/conf/micro-standard-config.xml new file mode 100644 index 00000000000..44f67f4187c --- /dev/null +++ b/contrib/benchmark/conf/micro-standard-config.xml @@ -0,0 +1,19 @@ + + + + + + + \ No newline at end of file diff --git a/contrib/benchmark/conf/standard-config.xml b/contrib/benchmark/conf/standard-config.xml new file mode 100644 index 00000000000..7f77d856877 --- /dev/null +++ b/contrib/benchmark/conf/standard-config.xml @@ -0,0 +1,19 @@ + + + + + + + \ No newline at end of file diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/AbstractBenchmarker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/AbstractBenchmarker.java new file mode 100644 index 00000000000..80217304de7 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/AbstractBenchmarker.java @@ -0,0 +1,61 @@ +package org.apache.lucene.benchmark; + +import java.io.File; +import java.io.IOException; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * + * + **/ +public abstract class AbstractBenchmarker implements Benchmarker +{ + /** + * Delete files and directories, even if non-empty. 
+ * + * @param dir file or directory + * @return true on success, false if no or part of files have been deleted + * @throws java.io.IOException + */ + public static boolean fullyDelete(File dir) throws IOException + { + if (dir == null || !dir.exists()) return false; + File contents[] = dir.listFiles(); + if (contents != null) + { + for (int i = 0; i < contents.length; i++) + { + if (contents[i].isFile()) + { + if (!contents[i].delete()) + { + return false; + } + } + else + { + if (!fullyDelete(contents[i])) + { + return false; + } + } + } + } + return dir.delete(); + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/BenchmarkOptions.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/BenchmarkOptions.java new file mode 100644 index 00000000000..e506e0a0cde --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/BenchmarkOptions.java @@ -0,0 +1,29 @@ +package org.apache.lucene.benchmark; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Marker Interface defining some common options. Implementations should define their own set of options that can be + * cast to in the {@link Benchmarker} interface. + *

+ * As benchmarks are added, perhaps a common set of Options will become clear + * + * + **/ +public interface BenchmarkOptions +{ +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/Benchmarker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/Benchmarker.java new file mode 100644 index 00000000000..76409ced6f2 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/Benchmarker.java @@ -0,0 +1,39 @@ +package org.apache.lucene.benchmark; + +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.File; + +import org.apache.lucene.benchmark.stats.TestData; + + +/** + * + * + **/ +public interface Benchmarker +{ + /** + * Benchmark according to the implementation, using the workingDir as the place to store things. + * + * @param workingDir The {@link java.io.File} directory to store temporary data in for running the benchmark + * @param options Any {@link BenchmarkOptions} that are needed for this benchmark. This + * @return The {@link org.apache.lucene.benchmark.stats.TestData} used to run the benchmark. 
+ */ + TestData[] benchmark(File workingDir, BenchmarkOptions options) throws Exception; +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/Constants.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/Constants.java new file mode 100644 index 00000000000..42926321c8f --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/Constants.java @@ -0,0 +1,33 @@ +package org.apache.lucene.benchmark; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Default values shared by the benchmark classes.
 **/
public class Constants
{
  /** Default number of times a benchmark is run. */
  public static final int DEFAULT_RUN_COUNT = 5;
  /** Default scale-up factor. */
  public static final int DEFAULT_SCALE_UP = 5;
  /** Default number of processed documents between progress log messages. */
  public static final int DEFAULT_LOG_STEP = 1000;

  /**
   * The two Boolean values, laid out so that index 0 is FALSE and index 1 is
   * TRUE. Declared final so the reference cannot be swapped out from under
   * code that indexes into it; the array contents must not be modified.
   */
  public static final Boolean[] BOOLEANS = new Boolean[] { Boolean.FALSE, Boolean.TRUE };

  /** Default cap on the number of documents to index (effectively unlimited). */
  public static final int DEFAULT_MAXIMUM_DOCUMENTS = Integer.MAX_VALUE;
}
+ */ + + +/** + * Sets up the + * + **/ +public class Driver +{ + private File workingDir; + private Benchmarker benchmarker; + private BenchmarkOptions options; + + public Driver() + { + } + + public Driver(Benchmarker benchmarker, BenchmarkOptions options) + { + this.benchmarker = benchmarker; + this.options = options; + } + + + /** + * Creates a Driver using Digester + * @param inputSource + */ + public Driver(File workingDir, InputSource inputSource) throws IOException, SAXException + { + Digester digester = new Digester(); + digester.setValidating(false); + digester.addObjectCreate("benchmark/benchmarker", "class", StandardBenchmarker.class); + digester.addSetProperties("benchmark/benchmarker"); + digester.addSetNext("benchmark/benchmarker", "setBenchmarker"); + digester.addObjectCreate("benchmark/options", "class", BenchmarkOptions.class); + digester.addSetProperties("benchmark/options"); + digester.addSetNext("benchmark/options", "setOptions"); + digester.push(this); + digester.parse(inputSource); + this.workingDir = workingDir; + } + + private void run() throws Exception + { + TestData [] data = benchmarker.benchmark(workingDir, options); + //Print out summary: + /*System.out.println("Test Data:"); + for (int i = 0; i < data.length; i++) + { + TestData testData = data[i]; + System.out.println("---------------"); + System.out.println(testData.showRunData(testData.getId())); + System.out.println("---------------"); + }*/ + + } + + public Benchmarker getBenchmarker() + { + return benchmarker; + } + + public void setBenchmarker(Benchmarker benchmarker) + { + this.benchmarker = benchmarker; + } + + public BenchmarkOptions getOptions() + { + return options; + } + + public void setOptions(BenchmarkOptions options) + { + this.options = options; + } + + public File getWorkingDir() + { + return workingDir; + } + + public void setWorkingDir(File workingDir) + { + this.workingDir = workingDir; + } + + public static void main(String[] args) + { + + if (args.length != 2) 
+ { + printHelp(args); + System.exit(0); + } + File workingDir = new File(args[0]); + File configFile = new File(args[1]); + if (configFile.exists()) + { + //Setup + try + { + Driver driver = new Driver(workingDir, new InputSource(new FileReader(configFile))); + driver.run(); + } + catch (Exception e) + { + e.printStackTrace(System.err); + } + } + + } + + + private static void printHelp(String[] args) + { + System.out.println("Usage: java -cp [...] " + Driver.class.getName() + " "); + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/ReutersQueries.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/ReutersQueries.java new file mode 100644 index 00000000000..f1102deaf57 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/ReutersQueries.java @@ -0,0 +1,59 @@ +package org.apache.lucene.benchmark.standard; + +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.SpanFirstQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.index.Term; + +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +/** + * + * + **/ +public class ReutersQueries +{ + public static String [] STANDARD_QUERIES = { + //Start with some short queries + "Salomon", "Comex", "night trading", "Japan Sony", + //Try some Phrase Queries + "\"Sony Japan\"", "\"food needs\"~3", + "\"World Bank\"^2 AND Nigeria", "\"World Bank\" -Nigeria", + "\"Ford Credit\"~5", + //Try some longer queries + "airline Europe Canada destination", + "Long term pressure by trade " + + "ministers is necessary if the current Uruguay round of talks on " + + "the General Agreement on Trade and Tariffs (GATT) is to " + + "succeed" + }; + + public static Query[] getPrebuiltQueries(String field) + { + //be wary of unanalyzed text + return new Query[]{ + new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 5), + new SpanNearQuery(new SpanQuery[]{new SpanTermQuery(new Term(field, "night")), new SpanTermQuery(new Term(field, "trading"))}, 4, false), + new SpanNearQuery(new SpanQuery[]{new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 10), new SpanTermQuery(new Term(field, "credit"))}, 10, false), + new WildcardQuery(new Term(field, "fo*")), + }; + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardBenchmarker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardBenchmarker.java new file mode 100644 index 00000000000..18d2f3d5cf8 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardBenchmarker.java @@ -0,0 +1,460 @@ +package org.apache.lucene.benchmark.standard; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileFilter; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.InputStream; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; +import java.util.List; +import java.util.Arrays; + +import org.apache.lucene.analysis.Analyzer; +import 
org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.benchmark.AbstractBenchmarker; +import org.apache.lucene.benchmark.BenchmarkOptions; +import org.apache.lucene.benchmark.Benchmarker; +import org.apache.lucene.benchmark.stats.QueryData; +import org.apache.lucene.benchmark.stats.TestData; +import org.apache.lucene.benchmark.stats.TestRunData; +import org.apache.lucene.benchmark.stats.TimeData; +import org.apache.lucene.document.DateTools; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.store.FSDirectory; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Reads in the Reuters Collection, downloaded from http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz + * in the workingDir/reuters and indexes them using the {@link org.apache.lucene.analysis.standard.StandardAnalyzer} + *

+ * Runs a standard set of documents through an Indexer and then runs a standard set of queries against the index. + * + * @see org.apache.lucene.benchmark.standard.StandardBenchmarker#benchmark(java.io.File, org.apache.lucene.benchmark.BenchmarkOptions) + * + * + **/ +public class StandardBenchmarker extends AbstractBenchmarker implements Benchmarker +{ + public static final String SOURCE_DIR = "reuters-out"; + + public static final String INDEX_DIR = "index"; + //30-MAR-1987 14:22:36.87 + private static DateFormat format = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS"); + //DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.SHORT); + static{ + format.setLenient(true); + } + + + public StandardBenchmarker() + { + } + + public TestData [] benchmark(File workingDir, BenchmarkOptions opts) throws Exception + { + StandardOptions options = (StandardOptions) opts; + workingDir.mkdirs(); + File sourceDir = getSourceDirectory(workingDir); + + sourceDir.mkdirs(); + File indexDir = new File(workingDir, INDEX_DIR); + indexDir.mkdirs(); + Analyzer a = new StandardAnalyzer(); + List queryList = new ArrayList(20); + queryList.addAll(Arrays.asList(ReutersQueries.STANDARD_QUERIES)); + queryList.addAll(Arrays.asList(ReutersQueries.getPrebuiltQueries("body"))); + Query[] qs = createQueries(queryList, a); + // Here you can limit the set of query benchmarks + QueryData[] qds = QueryData.getAll(qs); + // Here you can narrow down the set of test parameters + TestData[] params = TestData.getTestDataMinMaxMergeAndMaxBuffered(new File[]{sourceDir/*, jumboDir*/}, new Analyzer[]{a});//TestData.getAll(new File[]{sourceDir, jumboDir}, new Analyzer[]{a}); + System.out.println("Testing " + params.length + " different permutations."); + for (int i = 0; i < params.length; i++) + { + try + { + reset(indexDir); + params[i].setDirectory(FSDirectory.getDirectory(indexDir, true)); + params[i].setQueries(qds); + System.out.println(params[i]); + runBenchmark(params[i], options); + // Here 
you can collect and output the runData for further processing. + System.out.println(params[i].showRunData(params[i].getId())); + //bench.runSearchBenchmark(queries, dir); + params[i].getDirectory().close(); + System.runFinalization(); + System.gc(); + } + catch (Exception e) + { + e.printStackTrace(); + System.out.println("EXCEPTION: " + e.getMessage()); + //break; + } + } + return params; + } + + protected File getSourceDirectory(File workingDir) + { + return new File(workingDir, SOURCE_DIR); + } + + /** + * Run benchmark using supplied parameters. + * + * @param params benchmark parameters + * @throws Exception + */ + protected void runBenchmark(TestData params, StandardOptions options) throws Exception + { + System.out.println("Start Time: " + new Date()); + int runCount = options.getRunCount(); + for (int i = 0; i < runCount; i++) + { + TestRunData trd = new TestRunData(); + trd.startRun(); + trd.setId(String.valueOf(i)); + IndexWriter iw = new IndexWriter(params.getDirectory(), params.getAnalyzer(), true); + iw.setMergeFactor(params.getMergeFactor()); + iw.setMaxBufferedDocs(params.getMaxBufferedDocs()); + + iw.setUseCompoundFile(params.isCompound()); + makeIndex(trd, params.getSource(), iw, true, true, false, options); + if (params.isOptimize()) + { + TimeData td = new TimeData("optimize"); + trd.addData(td); + td.start(); + iw.optimize(); + td.stop(); + trd.addData(td); + } + iw.close(); + QueryData[] queries = params.getQueries(); + if (queries != null) + { + IndexReader ir = null; + IndexSearcher searcher = null; + for (int k = 0; k < queries.length; k++) + { + QueryData qd = queries[k]; + if (ir != null && qd.reopen) + { + searcher.close(); + ir.close(); + ir = null; + searcher = null; + } + if (ir == null) + { + ir = IndexReader.open(params.getDirectory()); + searcher = new IndexSearcher(ir); + } + Document doc = null; + if (qd.warmup) + { + TimeData td = new TimeData(qd.id + "-warm"); + for (int m = 0; m < ir.maxDoc(); m++) + { + td.start(); + if 
(ir.isDeleted(m)) + { + td.stop(); + continue; + } + doc = ir.document(m); + td.stop(); + } + trd.addData(td); + } + TimeData td = new TimeData(qd.id + "-srch"); + td.start(); + Hits h = searcher.search(qd.q); + //System.out.println("Hits Size: " + h.length() + " Query: " + qd.q); + td.stop(); + trd.addData(td); + td = new TimeData(qd.id + "-trav"); + if (h != null && h.length() > 0) + { + for (int m = 0; m < h.length(); m++) + { + td.start(); + int id = h.id(m); + if (qd.retrieve) + { + doc = ir.document(id); + } + td.stop(); + } + } + trd.addData(td); + } + try + { + if (searcher != null) + { + searcher.close(); + } + } + catch (Exception e) + { + } + ; + try + { + if (ir != null) + { + ir.close(); + } + } + catch (Exception e) + { + } + ; + } + trd.endRun(); + params.getRunData().add(trd); + //System.out.println(params[i].showRunData(params[i].getId())); + //params.showRunData(params.getId()); + } + System.out.println("End Time: " + new Date()); + } + + /** + * Parse the Reuters SGML and index: + * Date, Title, Dateline, Body + * + * + * + * @param in input file + * @return Lucene document + */ + protected Document makeDocument(File in, String[] tags, boolean stored, boolean tokenized, boolean tfv) + throws Exception + { + Document doc = new Document(); + // tag this document + if (tags != null) + { + for (int i = 0; i < tags.length; i++) + { + doc.add(new Field("tag" + i, tags[i], stored == true ? Field.Store.YES : Field.Store.NO, + tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO)); + } + } + doc.add(new Field("file", in.getCanonicalPath(), stored == true ? Field.Store.YES : Field.Store.NO, + tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? 
Field.TermVector.YES : Field.TermVector.NO)); + BufferedReader reader = new BufferedReader(new FileReader(in)); + String line = null; + //First line is the date, 3rd is the title, rest is body + String dateStr = reader.readLine(); + reader.readLine();//skip an empty line + String title = reader.readLine(); + reader.readLine();//skip an empty line + StringBuffer body = new StringBuffer(1024); + while ((line = reader.readLine()) != null) + { + body.append(line).append(' '); + } + Date date = format.parse(dateStr.trim()); + + doc.add(new Field("date", DateTools.dateToString(date, DateTools.Resolution.SECOND), Field.Store.YES, Field.Index.UN_TOKENIZED)); + + if (title != null) + { + doc.add(new Field("title", title, stored == true ? Field.Store.YES : Field.Store.NO, + tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO)); + } + if (body.length() > 0) + { + doc.add(new Field("body", body.toString(), stored == true ? Field.Store.YES : Field.Store.NO, + tokenized == true ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED, tfv == true ? Field.TermVector.YES : Field.TermVector.NO)); + } + + return doc; + } + + /** + * Make index, and collect time data. 
+ * + * @param trd run data to populate + * @param srcDir directory with source files + * @param iw index writer, already open + * @param stored store values of fields + * @param tokenized tokenize fields + * @param tfv store term vectors + * @throws Exception + */ + protected void makeIndex(TestRunData trd, File srcDir, IndexWriter iw, boolean stored, boolean tokenized, + boolean tfv, StandardOptions options) throws Exception + { + //File[] groups = srcDir.listFiles(); + List files = new ArrayList(); + getAllFiles(srcDir, null, files); + Document doc = null; + long cnt = 0L; + TimeData td = new TimeData(); + td.name = "addDocument"; + int scaleUp = options.getScaleUp(); + int logStep = options.getLogStep(); + int max = Math.min(files.size(), options.getMaximumDocumentsToIndex()); + for (int s = 0; s < scaleUp; s++) + { + String[] tags = new String[]{srcDir.getName() + "/" + s}; + int i = 0; + for (Iterator iterator = files.iterator(); iterator.hasNext() && i < max; i++) + { + File file = (File) iterator.next(); + doc = makeDocument(file, tags, stored, tokenized, tfv); + td.start(); + iw.addDocument(doc); + td.stop(); + cnt++; + if (cnt % logStep == 0) + { + System.err.println(" - processed " + cnt + ", run id=" + trd.getId()); + trd.addData(td); + td.reset(); + } + } + } + trd.addData(td); + } + + public static void getAllFiles(File srcDir, FileFilter filter, List allFiles) + { + File [] files = srcDir.listFiles(filter); + for (int i = 0; i < files.length; i++) + { + File file = files[i]; + if (file.isDirectory()) + { + getAllFiles(file, filter, allFiles); + } + else + { + allFiles.add(file); + } + } + } + + /** + * Parse the strings containing Lucene queries. 
+ * + * @param qs array of strings containing query expressions + * @param a analyzer to use when parsing queries + * @return array of Lucene queries + */ + public static Query[] createQueries(List qs, Analyzer a) + { + QueryParser qp = new QueryParser("body", a); + List queries = new ArrayList(); + for (int i = 0; i < qs.size(); i++) + { + try + { + Object query = qs.get(i); + Query q = null; + if (query instanceof String) + { + q = qp.parse((String) query); + } + else if (query instanceof Query) + { + q = (Query) query; + } + else + { + System.err.println("Unsupported Query Type: " + query); + } + if (q != null) + { + queries.add(q); + } + + } + catch (Exception e) + { + e.printStackTrace(); + } + } + return (Query[]) queries.toArray(new Query[0]); + } + + /** + * Remove existing index. + * + * @throws Exception + */ + protected void reset(File indexDir) throws Exception + { + if (indexDir.exists()) + { + fullyDelete(indexDir); + } + indexDir.mkdirs(); + } + + /** + * Save a stream to a file. + * + * @param is input stream + * @param out output file + * @param closeInput if true, close the input stream when done. 
+ * @throws Exception + */ + protected void saveStream(InputStream is, File out, boolean closeInput) throws Exception + { + byte[] buf = new byte[4096]; + FileOutputStream fos = new FileOutputStream(out); + int len = 0; + long total = 0L; + long time = System.currentTimeMillis(); + long delta = time; + while ((len = is.read(buf)) > 0) + { + fos.write(buf, 0, len); + total += len; + time = System.currentTimeMillis(); + if (time - delta > 5000) + { + System.err.println(" - copied " + total / 1024 + " kB..."); + delta = time; + } + } + fos.flush(); + fos.close(); + if (closeInput) + { + is.close(); + } + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardOptions.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardOptions.java new file mode 100644 index 00000000000..0b61f54931e --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardOptions.java @@ -0,0 +1,86 @@ +package org.apache.lucene.benchmark.standard; + +import java.io.File; + +import org.apache.lucene.benchmark.Constants; +import org.apache.lucene.benchmark.BenchmarkOptions; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * + * + **/ +public class StandardOptions implements BenchmarkOptions +{ + private int runCount = Constants.DEFAULT_RUN_COUNT; + private int logStep = Constants.DEFAULT_LOG_STEP; + private int scaleUp = Constants.DEFAULT_SCALE_UP; + private int maximumDocumentsToIndex = Constants.DEFAULT_MAXIMUM_DOCUMENTS; + + + public int getMaximumDocumentsToIndex() + { + return maximumDocumentsToIndex; + } + + public void setMaximumDocumentsToIndex(int maximumDocumentsToIndex) + { + this.maximumDocumentsToIndex = maximumDocumentsToIndex; + } + + /** + * How often to print out log messages when in benchmark loops + * @return + */ + public int getLogStep() + { + return logStep; + } + + public void setLogStep(int logStep) + { + this.logStep = logStep; + } + + /** + * The number of times to run the benchmark + * @return + */ + public int getRunCount() + { + return runCount; + } + + public void setRunCount(int runCount) + { + this.runCount = runCount; + } + + /** + * + * @return + */ + public int getScaleUp() + { + return scaleUp; + } + + public void setScaleUp(int scaleUp) + { + this.scaleUp = scaleUp; + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/MemUsage.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/MemUsage.java new file mode 100644 index 00000000000..8d91cfc21d5 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/MemUsage.java @@ -0,0 +1,43 @@ +package org.apache.lucene.benchmark.stats; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Value holder for memory statistics: minimum, average and maximum of the
 * free and of the total memory.
 *
 * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
 */
public class MemUsage {
  public long maxFree, minFree, avgFree;

  public long maxTotal, minTotal, avgTotal;

  /** Renders the values unscaled, with "B" as the unit. */
  public String toString() {
    return toScaledString(1, "B");
  }

  /**
   * Renders the values as <code>free=min/avg/max unit, total=min/avg/max unit</code>.
   *
   * @param div  divisor applied to every value (integer division)
   * @param unit unit string appended after each group of values
   */
  public String toScaledString(int div, String unit) {
    String freePart = "free=" + (minFree / div) + "/" + (avgFree / div) + "/" + (maxFree / div);
    String totalPart = "total=" + (minTotal / div) + "/" + (avgTotal / div) + "/" + (maxTotal / div);
    return freePart + " " + unit + ", " + totalPart + " " + unit;
  }
}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Vector; + +import org.apache.lucene.search.Query; +import org.apache.lucene.benchmark.Constants; + +/** + * This class holds parameters for a query benchmark. + * + * @author Andrzej Bialecki <ab@getopt.org> + */ +public class QueryData { + /** Benchmark id */ + public String id; + /** Lucene query */ + public Query q; + /** If true, re-open index reader before benchmark. */ + public boolean reopen; + /** If true, warm-up the index reader before searching by sequentially + * retrieving all documents from index. + */ + public boolean warmup; + /** + * If true, actually retrieve documents returned in Hits. + */ + public boolean retrieve; + + /** + * Prepare a list of benchmark data, using all possible combinations of + * benchmark parameters. + * @param queries source Lucene queries + * @return The QueryData + */ + public static QueryData[] getAll(Query[] queries) { + Vector vqd = new Vector(); + for (int i = 0; i < queries.length; i++) { + for (int r = 1; r >= 0; r--) { + for (int w = 1; w >= 0; w--) { + for (int t = 0; t < 2; t++) { + QueryData qd = new QueryData(); + qd.id="qd-" + i + r + w + t; + qd.reopen = Constants.BOOLEANS[r].booleanValue(); + qd.warmup = Constants.BOOLEANS[w].booleanValue(); + qd.retrieve = Constants.BOOLEANS[t].booleanValue(); + qd.q = queries[i]; + vqd.add(qd); + } + } + } + } + return (QueryData[])vqd.toArray(new QueryData[0]); + } + + /** Short legend for interpreting toString() output. 
*/ + public static String getLabels() { + return "# Query data: R-reopen, W-warmup, T-retrieve, N-no"; + } + + public String toString() { + return id + " " + (reopen ? "R" : "NR") + " " + (warmup ? "W" : "NW") + + " " + (retrieve ? "T" : "NT") + " [" + q.toString() + "]"; + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/TestData.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/TestData.java new file mode 100644 index 00000000000..628aa39a875 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/TestData.java @@ -0,0 +1,576 @@ +package org.apache.lucene.benchmark.stats; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.File; +import java.text.NumberFormat; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Vector; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.benchmark.Constants; +import org.apache.lucene.store.Directory; + + +/** + * This class holds together all parameters related to a test. Single test is + * performed several times, and all results are averaged. 
+ * + * @author Andrzej Bialecki <ab@getopt.org> + */ +public class TestData +{ + public static int[] MAX_BUFFERED_DOCS_COUNTS = new int[]{10, 20, 50, 100, 200, 500}; + public static int[] MERGEFACTOR_COUNTS = new int[]{10, 20, 50, 100, 200, 500}; + + /** + * ID of this test data. + */ + private String id; + /** + * Heap size. + */ + private long heap; + /** + * List of results for each test run with these parameters. + */ + private Vector runData = new Vector(); + private int maxBufferedDocs, mergeFactor; + /** + * Directory containing source files. + */ + private File source; + /** + * Lucene Directory implementation for creating an index. + */ + private Directory directory; + /** + * Analyzer to use when adding documents. + */ + private Analyzer analyzer; + /** + * If true, use compound file format. + */ + private boolean compound; + /** + * If true, optimize index when finished adding documents. + */ + private boolean optimize; + /** + * Data for search benchmarks. + */ + private QueryData[] queries; + + public TestData() + { + heap = Runtime.getRuntime().maxMemory(); + } + + private static class DCounter + { + double total; + int count, recordCount; + } + + private static class LCounter + { + long total; + int count; + } + + private static class LDCounter + { + double Dtotal; + int Dcount, DrecordCount; + long Ltotal0; + int Lcount0; + long Ltotal1; + int Lcount1; + } + + /** + * Get a textual summary of the benchmark results, average from all test runs. 
+ */ + static final String ID = "# testData id "; + static final String OP = "operation "; + static final String RUNCNT = " runCnt"; + static final String RECCNT = " recCnt"; + static final String RECSEC = " rec/s"; + static final String FREEMEM = " avgFreeMem"; + static final String TOTMEM = " avgTotalMem"; + static final String COLS[] = { + ID, + OP, + RUNCNT, + RECCNT, + RECSEC, + FREEMEM, + TOTMEM + }; + public String showRunData(String prefix) + { + if (runData.size() == 0) + { + return "# [NO RUN DATA]"; + } + HashMap resByTask = new HashMap(); + StringBuffer sb = new StringBuffer(); + String lineSep = System.getProperty("line.separator"); + sb.append("warm = Warm Index Reader").append(lineSep).append("srch = Search Index").append(lineSep).append("trav = Traverse Hits list, optionally retrieving document").append(lineSep).append(lineSep); + for (int i = 0; i < COLS.length; i++) { + sb.append(COLS[i]); + } + sb.append("\n"); + LinkedHashMap mapMem = new LinkedHashMap(); + LinkedHashMap mapSpeed = new LinkedHashMap(); + for (int i = 0; i < runData.size(); i++) + { + TestRunData trd = (TestRunData) runData.get(i); + Collection labels = trd.getLabels(); + Iterator it = labels.iterator(); + while (it.hasNext()) + { + String label = (String) it.next(); + MemUsage mem = trd.getMemUsage(label); + if (mem != null) + { + TestData.LCounter[] tm = (TestData.LCounter[]) mapMem.get(label); + if (tm == null) + { + tm = new TestData.LCounter[2]; + tm[0] = new TestData.LCounter(); + tm[1] = new TestData.LCounter(); + mapMem.put(label, tm); + } + tm[0].total += mem.avgFree; + tm[0].count++; + tm[1].total += mem.avgTotal; + tm[1].count++; + } + TimeData td = trd.getTotals(label); + if (td != null) + { + TestData.DCounter dc = (TestData.DCounter) mapSpeed.get(label); + if (dc == null) + { + dc = new TestData.DCounter(); + mapSpeed.put(label, dc); + } + dc.count++; + //dc.total += td.getRate(); + dc.total += (td.count>0 && td.elapsed<=0 ? 
1 : td.elapsed); // assume atleast 1ms for any countable op + dc.recordCount += td.count; + } + } + } + LinkedHashMap res = new LinkedHashMap(); + Iterator it = mapSpeed.keySet().iterator(); + while (it.hasNext()) + { + String label = (String) it.next(); + TestData.DCounter dc = (TestData.DCounter) mapSpeed.get(label); + res.put(label, + format(dc.count, RUNCNT) + + format(dc.recordCount / dc.count, RECCNT) + + format(1,(float) (dc.recordCount * 1000.0 / (dc.total>0 ? dc.total : 1.0)), RECSEC) + //format((float) (dc.total / (double) dc.count), RECSEC) + ); + + // also sum by task + String task = label.substring(label.lastIndexOf("-")+1); + LDCounter ldc = (LDCounter) resByTask.get(task); + if (ldc==null) { + ldc = new LDCounter(); + resByTask.put(task,ldc); + } + ldc.Dcount += dc.count; + ldc.DrecordCount += dc.recordCount; + ldc.Dtotal += (dc.count>0 && dc.total<=0 ? 1 : dc.total); // assume atleast 1ms for any countable op + } + it = mapMem.keySet().iterator(); + while (it.hasNext()) + { + String label = (String) it.next(); + TestData.LCounter[] lc = (TestData.LCounter[]) mapMem.get(label); + String speed = (String) res.get(label); + boolean makeSpeed = false; + if (speed == null) + { + makeSpeed = true; + speed = + format(lc[0].count, RUNCNT) + + format(0, RECCNT) + + format(0,(float)0.0, RECSEC); + } + res.put(label, speed + + format(0, lc[0].total / lc[0].count, FREEMEM) + + format(0, lc[1].total / lc[1].count, TOTMEM)); + + // also sum by task + String task = label.substring(label.lastIndexOf("-")+1); + LDCounter ldc = (LDCounter) resByTask.get(task); + if (ldc==null) { + ldc = new LDCounter(); + resByTask.put(task,ldc); + makeSpeed = true; + } + if (makeSpeed) { + ldc.Dcount += lc[0].count; + } + ldc.Lcount0 += lc[0].count; + ldc.Lcount1 += lc[1].count; + ldc.Ltotal0 += lc[0].total; + ldc.Ltotal1 += lc[1].total; + } + it = res.keySet().iterator(); + while (it.hasNext()) + { + String label = (String) it.next(); + sb.append(format(prefix, ID)); + 
sb.append(format(label, OP)); + sb.append(res.get(label)).append("\n"); + } + // show results by task (srch, optimize, etc.) + sb.append("\n"); + for (int i = 0; i < COLS.length; i++) { + sb.append(COLS[i]); + } + sb.append("\n"); + it = resByTask.keySet().iterator(); + while (it.hasNext()) + { + String task = (String) it.next(); + LDCounter ldc = (LDCounter) resByTask.get(task); + sb.append(format(" ", ID)); + sb.append(format(task, OP)); + sb.append(format(ldc.Dcount, RUNCNT)); + sb.append(format(ldc.DrecordCount / ldc.Dcount, RECCNT)); + sb.append(format(1,(float) (ldc.DrecordCount * 1000.0 / (ldc.Dtotal>0 ? ldc.Dtotal : 1.0)), RECSEC)); + sb.append(format(0, ldc.Ltotal0 / ldc.Lcount0, FREEMEM)); + sb.append(format(0, ldc.Ltotal1 / ldc.Lcount1, TOTMEM)); + sb.append("\n"); + } + return sb.toString(); + } + + private static NumberFormat numFormat [] = { NumberFormat.getInstance(), NumberFormat.getInstance()}; + private static final String padd = " "; + static { + numFormat[0].setMaximumFractionDigits(0); + numFormat[0].setMinimumFractionDigits(0); + numFormat[1].setMaximumFractionDigits(1); + numFormat[1].setMinimumFractionDigits(1); + } + + // padd number from left + // numFracDigits must be 0 or 1. + static String format(int numFracDigits, float f, String col) { + String res = padd + numFormat[numFracDigits].format(f); + return res.substring(res.length() - col.length()); + } + + // padd number from left + static String format(int n, String col) { + String res = padd + n; + return res.substring(res.length() - col.length()); + } + + // padd string from right + static String format(String s, String col) { + return (s + padd).substring(0,col.length()); + } + + /** + * Prepare a list of benchmark data, using all possible combinations of + * benchmark parameters. + * + * @param sources list of directories containing different source document + * collections + * @param analyzers of analyzers to use. 
+ */ + public static TestData[] getAll(File[] sources, Analyzer[] analyzers) + { + List res = new ArrayList(50); + TestData ref = new TestData(); + for (int q = 0; q < analyzers.length; q++) + { + for (int m = 0; m < sources.length; m++) + { + for (int i = 0; i < MAX_BUFFERED_DOCS_COUNTS.length; i++) + { + for (int k = 0; k < MERGEFACTOR_COUNTS.length; k++) + { + for (int n = 0; n < Constants.BOOLEANS.length; n++) + { + for (int p = 0; p < Constants.BOOLEANS.length; p++) + { + ref.id = "td-" + q + m + i + k + n + p; + ref.source = sources[m]; + ref.analyzer = analyzers[q]; + ref.maxBufferedDocs = MAX_BUFFERED_DOCS_COUNTS[i]; + ref.mergeFactor = MERGEFACTOR_COUNTS[k]; + ref.compound = Constants.BOOLEANS[n].booleanValue(); + ref.optimize = Constants.BOOLEANS[p].booleanValue(); + try + { + res.add(ref.clone()); + } + catch (Exception e) + { + e.printStackTrace(); + } + } + } + } + } + } + } + return (TestData[]) res.toArray(new TestData[0]); + } + + /** + * Similar to {@link #getAll(java.io.File[], org.apache.lucene.analysis.Analyzer[])} but only uses + * maxBufferedDocs of 10 and 100 and same for mergeFactor, thus reducing the number of permutations significantly. + * It also only uses compund file and optimize is always true. 
+ * + * @param sources + * @param analyzers + * @return An Array of {@link TestData} + */ + public static TestData[] getTestDataMinMaxMergeAndMaxBuffered(File[] sources, Analyzer[] analyzers) + { + List res = new ArrayList(50); + TestData ref = new TestData(); + for (int q = 0; q < analyzers.length; q++) + { + for (int m = 0; m < sources.length; m++) + { + ref.id = "td-" + q + m + "_" + 10 + "_" + 10; + ref.source = sources[m]; + ref.analyzer = analyzers[q]; + ref.maxBufferedDocs = 10; + ref.mergeFactor = 10;//MERGEFACTOR_COUNTS[k]; + ref.compound = true; + ref.optimize = true; + try + { + res.add(ref.clone()); + } + catch (Exception e) + { + e.printStackTrace(); + } + ref.id = "td-" + q + m + "_" + 10 + "_" + 100; + ref.source = sources[m]; + ref.analyzer = analyzers[q]; + ref.maxBufferedDocs = 10; + ref.mergeFactor = 100;//MERGEFACTOR_COUNTS[k]; + ref.compound = true; + ref.optimize = true; + try + { + res.add(ref.clone()); + } + catch (Exception e) + { + e.printStackTrace(); + } + ref.id = "td-" + q + m + "_" + 100 + "_" + 10; + ref.source = sources[m]; + ref.analyzer = analyzers[q]; + ref.maxBufferedDocs = 100; + ref.mergeFactor = 10;//MERGEFACTOR_COUNTS[k]; + ref.compound = true; + ref.optimize = true; + try + { + res.add(ref.clone()); + } + catch (Exception e) + { + e.printStackTrace(); + } + ref.id = "td-" + q + m + "_" + 100 + "_" + 100; + ref.source = sources[m]; + ref.analyzer = analyzers[q]; + ref.maxBufferedDocs = 100; + ref.mergeFactor = 100;//MERGEFACTOR_COUNTS[k]; + ref.compound = true; + ref.optimize = true; + try + { + res.add(ref.clone()); + } + catch (Exception e) + { + e.printStackTrace(); + } + } + } + return (TestData[]) res.toArray(new TestData[0]); + } + + protected Object clone() + { + TestData cl = new TestData(); + cl.id = id; + cl.compound = compound; + cl.heap = heap; + cl.mergeFactor = mergeFactor; + cl.maxBufferedDocs = maxBufferedDocs; + cl.optimize = optimize; + cl.source = source; + cl.directory = directory; + cl.analyzer = 
analyzer; + // don't clone runData + return cl; + } + + public String toString() + { + StringBuffer res = new StringBuffer(); + res.append("#-- ID: ").append(id).append(", ").append(new Date()).append(", heap=").append(heap).append(" --\n"); + res.append("# source=").append(source).append(", directory=").append(directory).append("\n"); + res.append("# maxBufferedDocs=").append(maxBufferedDocs).append(", mergeFactor=").append(mergeFactor); + res.append(", compound=").append(compound).append(", optimize=").append(optimize).append("\n"); + if (queries != null) + { + res.append(QueryData.getLabels()).append("\n"); + for (int i = 0; i < queries.length; i++) + { + res.append("# ").append(queries[i].toString()).append("\n"); + } + } + return res.toString(); + } + + public Analyzer getAnalyzer() + { + return analyzer; + } + + public void setAnalyzer(Analyzer analyzer) + { + this.analyzer = analyzer; + } + + public boolean isCompound() + { + return compound; + } + + public void setCompound(boolean compound) + { + this.compound = compound; + } + + public Directory getDirectory() + { + return directory; + } + + public void setDirectory(Directory directory) + { + this.directory = directory; + } + + public long getHeap() + { + return heap; + } + + public void setHeap(long heap) + { + this.heap = heap; + } + + public String getId() + { + return id; + } + + public void setId(String id) + { + this.id = id; + } + + public int getMaxBufferedDocs() + { + return maxBufferedDocs; + } + + public void setMaxBufferedDocs(int maxBufferedDocs) + { + this.maxBufferedDocs = maxBufferedDocs; + } + + public int getMergeFactor() + { + return mergeFactor; + } + + public void setMergeFactor(int mergeFactor) + { + this.mergeFactor = mergeFactor; + } + + public boolean isOptimize() + { + return optimize; + } + + public void setOptimize(boolean optimize) + { + this.optimize = optimize; + } + + public QueryData[] getQueries() + { + return queries; + } + + public void setQueries(QueryData[] queries) + 
{ + this.queries = queries; + } + + public Vector getRunData() + { + return runData; + } + + public void setRunData(Vector runData) + { + this.runData = runData; + } + + public File getSource() + { + return source; + } + + public void setSource(File source) + { + this.source = source; + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/TestRunData.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/TestRunData.java new file mode 100644 index 00000000000..c397824d3b3 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/TestRunData.java @@ -0,0 +1,175 @@ +package org.apache.lucene.benchmark.stats; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.util.LinkedHashMap; +import java.util.Vector; +import java.util.Collection; +import java.util.Iterator; + +/** + * This class holds series of TimeData related to a single test run. TimeData + * values may contribute to different measurements, so this class provides also + * some useful methods to separate them. + * + * @author Andrzej Bialecki <ab@getopt.org> + */ +public class TestRunData { + private String id; + + /** Start and end time of this test run. 
*/ + private long start = 0L, end = 0L; + + private LinkedHashMap data = new LinkedHashMap(); + + public TestRunData() {} + + public TestRunData(String id) { + this.id = id; + } + + public LinkedHashMap getData() + { + return data; + } + + public String getId() + { + return id; + } + + public void setId(String id) + { + this.id = id; + } + + public long getEnd() + { + return end; + } + + public long getStart() + { + return start; + } + + /** Mark the starting time of this test run. */ + public void startRun() { + start = System.currentTimeMillis(); + } + + /** Mark the ending time of this test run. */ + public void endRun() { + end = System.currentTimeMillis(); + } + + /** Add a data point. */ + public void addData(TimeData td) { + td.recordMemUsage(); + Vector v = (Vector) data.get(td.name); + if (v == null) { + v = new Vector(); + data.put(td.name, v); + } + v.add(td.clone()); + } + + /** Get a list of all available types of data points. */ + public Collection getLabels() { + return data.keySet(); + } + + /** Get total values from all data points of a given type. */ + public TimeData getTotals(String label) { + Vector v = (Vector) data.get(label); + if (v == null) + { + return null; + } + TimeData res = new TimeData("TOTAL " + label); + for (int i = 0; i < v.size(); i++) { + TimeData td = (TimeData) v.get(i); + res.count += td.count; + res.elapsed += td.elapsed; + } + return res; + } + + /** Get total values from all data points of all types. + * @return a list of TimeData values for all types. + */ + public Vector getTotals() { + Collection labels = getLabels(); + Vector v = new Vector(); + Iterator it = labels.iterator(); + while (it.hasNext()) { + TimeData td = getTotals((String) it.next()); + v.add(td); + } + return v; + } + + /** Get memory usage stats. for a given data type. 
*/ + public MemUsage getMemUsage(String label) { + Vector v = (Vector) data.get(label); + if (v == null) + { + return null; + } + MemUsage res = new MemUsage(); + res.minFree = Long.MAX_VALUE; + res.minTotal = Long.MAX_VALUE; + long avgFree = 0L, avgTotal = 0L; + for (int i = 0; i < v.size(); i++) { + TimeData td = (TimeData) v.get(i); + if (res.maxFree < td.freeMem) + { + res.maxFree = td.freeMem; + } + if (res.maxTotal < td.totalMem) + { + res.maxTotal = td.totalMem; + } + if (res.minFree > td.freeMem) + { + res.minFree = td.freeMem; + } + if (res.minTotal > td.totalMem) + { + res.minTotal = td.totalMem; + } + avgFree += td.freeMem; + avgTotal += td.totalMem; + } + res.avgFree = avgFree / v.size(); + res.avgTotal = avgTotal / v.size(); + return res; + } + + /** Return a string representation. */ + public String toString() { + StringBuffer sb = new StringBuffer(); + Collection labels = getLabels(); + Iterator it = labels.iterator(); + while (it.hasNext()) { + String label = (String) it.next(); + sb.append(id + "-" + label + " " + getTotals(label).toString(false) + " "); + sb.append(getMemUsage(label).toScaledString(1024 * 1024, "MB") + "\n"); + } + return sb.toString(); + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/TimeData.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/TimeData.java new file mode 100644 index 00000000000..91252fffb70 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/stats/TimeData.java @@ -0,0 +1,102 @@ +package org.apache.lucene.benchmark.stats; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * A single data point measuring processing speed: how many records were
 * handled, how long that took, and the memory figures sampled for it.
 *
 * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
 */
public class TimeData {
  /** Name of the data point - usually one of a data series with the same name */
  public String name;
  /** Number of records processed. */
  public long count = 0;
  /** Elapsed time in milliseconds. */
  public long elapsed = 0L;

  /** Timestamp taken by the most recent start(), in milliseconds. */
  private long delta = 0L;
  /** Free memory at the end of measurement interval. */
  public long freeMem = 0L;
  /** Total memory at the end of measurement interval. */
  public long totalMem = 0L;

  public TimeData() {
  }

  public TimeData(String name) {
    this.name = name;
  }

  /** Start counting elapsed time. */
  public void start() {
    delta = System.currentTimeMillis();
  }

  /** Stop counting elapsed time: counts one record and adds the time since start(). */
  public void stop() {
    count++;
    long sinceStart = System.currentTimeMillis() - delta;
    elapsed += sinceStart;
  }

  /** Record memory usage as currently reported by the runtime. */
  public void recordMemUsage() {
    Runtime rt = Runtime.getRuntime();
    freeMem = rt.freeMemory();
    totalMem = rt.totalMemory();
  }

  /** Reset counters. The memory figures are left as they are. */
  public void reset() {
    count = 0;
    elapsed = 0L;
    delta = 0L;
  }

  /** Returns a new TimeData carrying the same field values. */
  protected Object clone() {
    TimeData copy = new TimeData();
    copy.name = name;
    copy.count = count;
    copy.elapsed = elapsed;
    copy.delta = delta;
    copy.freeMem = freeMem;
    copy.totalMem = totalMem;
    return copy;
  }

  /** Get rate of processing, defined as number of processed records per second. */
  public double getRate() {
    // assume at least 1ms for any countable op
    long millis = elapsed > 0 ? elapsed : 1;
    return (double) count * 1000.0 / (double) millis;
  }

  /** Get a short legend for toString() output. */
  public static String getLabels() {
    return "# count\telapsed\trec/s\tfreeMem\ttotalMem";
  }

  /** Same as <code>toString(true)</code>. */
  public String toString() {
    return toString(true);
  }

  /**
   * Return a tab-separated string containing this data.
   * @param withMem if true, append also memory information
   * @return The String
   */
  public String toString(boolean withMem) {
    StringBuffer sb = new StringBuffer();
    sb.append(count).append("\t").append(elapsed).append("\t").append(getRate());
    if (withMem) {
      sb.append("\t").append(freeMem).append("\t").append(totalMem);
    }
    return sb.toString();
  }
}
+ */ + + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileFilter; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +/** + * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body + */ +public class ExtractReuters +{ + private File reutersDir; + private File outputDir; + private static final String LINE_SEPARATOR = System.getProperty("line.separator"); + + public ExtractReuters(File reutersDir, File outputDir) + { + this.reutersDir = reutersDir; + this.outputDir = outputDir; + System.out.println("Deleting all files in " + outputDir); + File [] files = outputDir.listFiles(); + for (int i = 0; i < files.length; i++) + { + files[i].delete(); + } + + } + + public void extract() + { + File [] sgmFiles = reutersDir.listFiles(new FileFilter() + { + public boolean accept(File file) + { + return file.getName().endsWith(".sgm"); + } + }); + if (sgmFiles != null && sgmFiles.length > 0) + { + for (int i = 0; i < sgmFiles.length; i++) + { + File sgmFile = sgmFiles[i]; + extractFile(sgmFile); + } + } + else + { + System.err.println("No .sgm files in " + reutersDir); + } + } + + Pattern EXTRACTION_PATTERN = Pattern.compile("(.*?)|(.*?)|(.*?)"); + + private static String[] META_CHARS + = {"&", "<", ">", "\"", "'"}; + + private static String[] META_CHARS_SERIALIZATIONS + = {"&", "<", ">", """, "'"}; + + /** + * Override if you wish to change what is extracted + * + * @param sgmFile + */ + protected void extractFile(File sgmFile) + { + try + { + BufferedReader reader = new BufferedReader(new FileReader(sgmFile)); + + StringBuffer buffer = new StringBuffer(1024); + StringBuffer outBuffer = new StringBuffer(1024); + + String line = null; + int index = -1; + int docNumber = 0; + while ((line = reader.readLine()) != null) + { + //when we see a closing reuters tag, flush the file + + if ((index = line.indexOf(" 
org.apache.lucene.benchmark.utils.ExtractReuters "); + } +} diff --git a/contrib/benchmark/src/java/package.html b/contrib/benchmark/src/java/package.html new file mode 100644 index 00000000000..e671b371b2d --- /dev/null +++ b/contrib/benchmark/src/java/package.html @@ -0,0 +1,44 @@ + + + + Lucene Benchmarking Package + + +

+

+ The benchmark contribution contains tools for benchmarking Lucene using a standard, freely available corpus. ANT will + download the corpus automatically, place it in a temp directory and then unpack it to the working.dir directory specified in the build. + The temp directory + and working directory can be safely removed after a run. However, the next time the task is run, it will need to download the files again. +

+ Classes implementing the Benchmarker interface should have a no-argument constructor if they are to be used with the Driver class. The Driver + class is provided for convenience only. Feel free to implement your own main class for your benchmarker. +

+ The StandardBenchmarker is meant to be just that, a standard that runs out of the box with no configuration or changes needed. + Other benchmarking classes may derive from it to provide alternate views or to take in command line options. When reporting benchmarking runs + you should state any alterations you have made. +

+ To run the short version of the StandardBenchmarker, call "ant run-micro-standard". This should take a minute or so to complete and give you a preliminary idea of how your change affects the code's performance.

+ To run the long version of the StandardBenchmarker, call "ant run-standard". This takes considerably longer. +

+ The original code for these classes was donated by Andrzej Bialecki at http://issues.apache.org/jira/browse/LUCENE-675 and has been updated by Grant Ingersoll to make some parts of the code reusable in other benchmarkers.

+
 
+ + \ No newline at end of file