diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt index 895d61b00f8..7f5351ec858 100644 --- a/contrib/benchmark/CHANGES.txt +++ b/contrib/benchmark/CHANGES.txt @@ -8,4 +8,7 @@ $Id:$ 1. Committed Doron Cohen's benchmarking contribution, which provides an easily expandable task based approach to benchmarking. See the javadocs for information. (Doron Cohen via Grant Ingersoll) -2. Added this file. \ No newline at end of file +2. Added this file. + +3. 2/11/07: LUCENE-790 and 788: Fixed Locale issue with date formatter. Fixed some minor issues with benchmarking by task. Added a dependency + on the Lucene demo to the build classpath. (Doron Cohen, Grant Ingersoll) \ No newline at end of file diff --git a/contrib/benchmark/build.xml b/contrib/benchmark/build.xml index 05cb621fc9a..70591434e78 100644 --- a/contrib/benchmark/build.xml +++ b/contrib/benchmark/build.xml @@ -8,6 +8,7 @@ + @@ -84,8 +85,11 @@ + + + diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java index 28f758d0fa2..8a3c9c15b39 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java @@ -27,8 +27,17 @@ import org.apache.lucene.benchmark.byTask.utils.Config; * Run the benchmark algorithm. *

Usage: java Benchmark algorithm-file *

    - *
  1. Read algorithm. - *
  2. Run the algorithm. + *
  3. Read algorithm.
  4. + *
  5. Run the algorithm.
  6. + *
+ * Things to be added/fixed in "Benchmarking by tasks": + *
    + *
  1. TODO - report into Excel and/or graphed view.
  2. + *
  3. TODO - perf comparison between Lucene releases over the years.
  4. + *
  5. TODO - perf report adequate to include in Lucene nightly build site? (so we can easily track performance changes.)
  6. + *
  7. TODO - add overall time control for repeated execution (vs. current by-count only).
  8. + *
  9. TODO - query maker that is based on index statistics.
  10. + *
  11. TODO - properties documentation - each task should document the properties it relies on.
  12. *
*/ public class Benchmark { diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java index bb4c07eed08..0b12c87a83a 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java @@ -23,24 +23,21 @@ import java.io.FileReader; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; -import java.util.Date; -import org.apache.lucene.document.DateTools; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; +import java.util.Locale; + import org.apache.lucene.benchmark.byTask.utils.Config; /** * A DocMaker using the Reuters collection for its input. */ -public class ReutersDocMaker extends SimpleDocMaker { +public class ReutersDocMaker extends BasicDocMaker { private DateFormat dateFormat; private File dataDir = null; - private ArrayList txtFiles = new ArrayList(); + private ArrayList inputFiles = new ArrayList(); private int nextFile = 0; - private int round=0; - private int count = 0; + private int iteration=0; /* (non-Javadoc) * @see SimpleDocMaker#setConfig(java.util.Properties) @@ -49,48 +46,28 @@ public class ReutersDocMaker extends SimpleDocMaker { super.setConfig(config); String d = config.get("docs.dir","reuters-out"); dataDir = new File(new File("work"),d); - addFiles(dataDir); - if (txtFiles.size()==0) { + collectFiles(dataDir,inputFiles); + if (inputFiles.size()==0) { throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); } // date format: 30-MAR-1987 14:22:36.87 - dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS"); + dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US); dateFormat.setLenient(true); } - private void addFiles(File f) { - if (!f.canRead()) { - return; - 
} - if (f.isDirectory()) { - File files[] = f.listFiles(); - for (int i = 0; i < files.length; i++) { - addFiles(files[i]); - } - return; - } - txtFiles.add(f); - addUniqueBytes(f.length()); - } - - /* (non-Javadoc) - * @see SimpleDocMaker#makeDocument() - */ - public Document makeDocument() throws Exception { + protected DocData getNextDocData() throws Exception { File f = null; String name = null; synchronized (this) { - f = (File) txtFiles.get(nextFile++); - name = f.getCanonicalPath()+"_"+round; - if (nextFile >= txtFiles.size()) { + f = (File) inputFiles.get(nextFile++); + name = f.getCanonicalPath()+"_"+iteration; + if (nextFile >= inputFiles.size()) { // exhausted files, start a new round nextFile = 0; - round++; + iteration++; } } - Document doc = new Document(); - doc.add(new Field("name",name,storeVal,indexVal,termVecVal)); BufferedReader reader = new BufferedReader(new FileReader(f)); String line = null; //First line is the date, 3rd is the title, rest is body @@ -98,27 +75,23 @@ public class ReutersDocMaker extends SimpleDocMaker { reader.readLine();//skip an empty line String title = reader.readLine(); reader.readLine();//skip an empty line - StringBuffer body = new StringBuffer(1024); + StringBuffer bodyBuf = new StringBuffer(1024); while ((line = reader.readLine()) != null) { - body.append(line).append(' '); + bodyBuf.append(line).append(' '); } - Date date = dateFormat.parse(dateStr.trim()); - doc.add(new Field("date", DateTools.dateToString(date, DateTools.Resolution.SECOND), - Field.Store.YES, Field.Index.UN_TOKENIZED)); - - if (title != null) { - doc.add(new Field("title", title, storeVal,indexVal,termVecVal)); - } - if (body.length() > 0) { - doc.add(new Field("body", body.toString(), storeVal,indexVal,termVecVal)); - } - - count++; + addBytes(f.length()); - return doc; + DocData dd = new DocData(); + + dd.date = dateFormat.parse(dateStr.trim()); + dd.name = name; + dd.title = title; + dd.body = bodyBuf.toString(); + return dd; } + /* * 
(non-Javadoc) * @see DocMaker#resetIinputs() @@ -126,8 +99,7 @@ public class ReutersDocMaker extends SimpleDocMaker { public synchronized void resetInputs() { super.resetInputs(); nextFile = 0; - round = 0; - count = 0; + iteration = 0; } /* @@ -135,22 +107,7 @@ public class ReutersDocMaker extends SimpleDocMaker { * @see DocMaker#numUniqueTexts() */ public int numUniqueTexts() { - return txtFiles.size(); + return inputFiles.size(); } - /* - * (non-Javadoc) - * @see DocMaker#getCount() - */ - public int getCount() { - return count; - } - - /* - * (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument(int) - */ - public Document makeDocument(int size) throws Exception { - throw new Exception(this+".makeDocument (int size) is not supported!"); - } } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java index f3293ea8208..5628460e710 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java @@ -17,29 +17,13 @@ package org.apache.lucene.benchmark.byTask.feeds; * limitations under the License. */ -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.benchmark.byTask.utils.Config; -import org.apache.lucene.benchmark.byTask.utils.Format; - - /** * Create documents for the test */ -public class SimpleDocMaker implements DocMaker { +public class SimpleDocMaker extends BasicDocMaker { - static final String BODY_FIELD = "body"; private int docID = 0; - private long numBytes = 0; - private long numUniqueBytes = 0; - protected Config config; - private int nextDocTextPosition = 0; // for creating docs of fixed size. 
- - protected Field.Store storeVal = Field.Store.NO; - protected Field.Index indexVal = Field.Index.TOKENIZED; - protected Field.TermVector termVecVal = Field.TermVector.NO; - static final String DOC_TEXT = // from a public first aid info at http://firstaid.ie.eu.org "Well it may be a little dramatic but sometimes it true. " + "If you call the emergency medical services to an incident, " + @@ -52,100 +36,18 @@ public class SimpleDocMaker implements DocMaker { "ones and the stranger whose life may depend on you being in the " + "right place at the right time with the right knowledge."; - private static int DOC_TEXT_LENGTH = DOC_TEXT.length(); - - /* - * (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument() - */ - public Document makeDocument () throws Exception { - return makeDocument(0); - } - - /* - * (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument(int) - */ - public Document makeDocument(int size) throws Exception { - int docid = newdocid(); - Document doc = new Document(); - doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal)); - String docText = createDocText(size); - doc.add(new Field(BODY_FIELD, "synthetic body text"+docid+" "+docText, storeVal, indexVal, termVecVal)); - addBytes(docText.length()); // should multiply by 2 here? 
- return doc; - } - - private synchronized int[] nextDocText(int fixedDocSize) { - int from = nextDocTextPosition; - int to = nextDocTextPosition; - int wraps = 0; - int size = 0; - - while (size 0) { - sb.append(DOC_TEXT.substring(from)); - from = 0; - } - sb.append(DOC_TEXT.substring(from,to)); - return sb.toString(); - } - // return a new docid private synchronized int newdocid() { return docID++; } - /* (non-Javadoc) - * @see DocMaker#setConfig(java.util.Properties) - */ - public void setConfig(Config config) { - this.config = config; - boolean stored = config.get("doc.stored",false); - boolean tokenized = config.get("doc.tokenized",true); - boolean termVec = config.get("doc.term.vector",false); - storeVal = (stored ? Field.Store.YES : Field.Store.NO); - indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED); - termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO); - } - /* * (non-Javadoc) * @see DocMaker#resetIinputs() */ public synchronized void resetInputs() { - printDocStatistics(); + super.resetInputs(); docID = 0; - numBytes = 0; } /* @@ -156,72 +58,12 @@ public class SimpleDocMaker implements DocMaker { return 0; // not applicable } - /* - * (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#numUniqueBytes() - */ - public long numUniqueBytes() { - return numUniqueBytes; - } - - /* - * (non-Javadoc) - * @see DocMaker#getCount() - */ - public int getCount() { - return docID; - } - - /* - * (non-Javadoc) - * @see DocMaker#getByteCount() - */ - public long getByteCount() { - return numBytes; - } - - protected void addUniqueBytes (long n) { - numUniqueBytes += n; - } - - protected void addBytes (long n) { - numBytes += n; - } - - /* - * (non-Javadoc) - * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#printDocStatistics() - */ - private int lastPrintedNumUniqueTexts = 0; - private long lastPrintedNumUniqueBytes = 0; - private int printNum = 0; - public void printDocStatistics() { - boolean print = 
false; - String col = " "; - StringBuffer sb = new StringBuffer(); - String newline = System.getProperty("line.separator"); - sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline); - int nut = numUniqueTexts(); - if (nut > lastPrintedNumUniqueTexts) { - print = true; - sb.append("total bytes of unique texts: ").append(Format.format(0,nut,col)).append(newline); - lastPrintedNumUniqueTexts = nut; - } - long nub = numUniqueBytes(); - if (nub > lastPrintedNumUniqueBytes) { - print = true; - sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline); - lastPrintedNumUniqueBytes = nub; - } - if (getCount()>0) { - print = true; - sb.append("num files added since last inputs reset: ").append(Format.format(0,getCount(),col)).append(newline); - sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getByteCount(),col)).append(newline); - } - if (print) { - System.out.println(sb.append(newline).toString()); - printNum++; - } + protected DocData getNextDocData() { + DocData dd = new DocData(); + dd.body = DOC_TEXT; + dd.name = "doc"+newdocid(); + addBytes(DOC_TEXT.length()); + return dd; } } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html index cc1b3779cb4..7f9baaeea24 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html @@ -44,7 +44,7 @@ Contained packages: feeds - Sources foe benchmark inputs: documents and queries. + Sources for benchmark inputs: documents and queries. utils @@ -92,7 +92,7 @@ Easiest way to run a benchmarks is using the predefined ant task:
- would run the compound-penalty.alg "algorithm".
  • ant run-task -Dtask.alg=[full-path-to-your-alg-file] -
    - would run the your perf test "algorithm". +
    - would run your perf test "algorithm".
  • java org.apache.lucene.benchmark.byTask.programmatic.Sample
    - would run a performance test programmatically - without using an alg file. @@ -109,7 +109,7 @@ otherwise, you can extend the framework to meet your needs, as explained herein.

    Each benchmark run has a DocMaker and a QueryMaker. These two should usually match, so that "meaningful" queries are used for a certain collection. -Properties defined at the header of the alg file define which "makers" should be used. +Properties set at the header of the alg file define which "makers" should be used. You can also specify your own makers, implementing the DocMaker and QureyMaker interfaces.

    @@ -275,8 +275,8 @@ regular index/search work tasks, report tasks, and control tasks.
    This increments a global "round counter". All task runs that would start now would record the new, updated round counter as their round number. This would appear in reports. In particular, see RepSumByNameRound above. -
    An additional effect of NewRound, is that numeric and boolean properties defined in the - .properties file as a sequence of values, e.g. merge.factor=mrg:10:100:10:100 would +
    An additional effect of NewRound, is that numeric and boolean properties defined (at the head + of the .alg file) as a sequence of values, e.g. merge.factor=mrg:10:100:10:100 would increment (cyclic) to the next value. Note: this would also be reflected in the reports, in this case under a column that would be named "mrg".
  • @@ -368,7 +368,7 @@ Some of the currently defined properties are: (Make sure it is no shorter than any value in the sequence).
    • max.buffered -
      Example: buffered=buf.10.10.100.100 - +
      Example: max.buffered=buf:10:10:100:100 - this would define using maxBufferedDocs of 10 in iterations 0 and 1, and 100 in iterations 2 and 3.
    • diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java index 09118e768cc..55a881a3b23 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java @@ -51,6 +51,10 @@ public class Sample { // task to report RepSumByNameTask rep = new RepSumByNameTask(runData); top.addTask(rep); + + // print algorithm + System.out.println(top.toString()); + // execute top.doLogic(); } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java index dedd68ad469..d1bf82122dc 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java @@ -210,6 +210,12 @@ public class TaskSequence extends PerfTask { */ public void setNoChildReport() { letChildReport = false; + for (Iterator it = tasks.iterator(); it.hasNext();) { + PerfTask task = (PerfTask) it.next(); + if (task instanceof TaskSequence) { + ((TaskSequence)task).setNoChildReport(); + } + } } /** diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardBenchmarker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardBenchmarker.java index 18d2f3d5cf8..b4567769dbf 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardBenchmarker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardBenchmarker.java @@ -1,19 +1,5 @@ package org.apache.lucene.benchmark.standard; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileFilter; -import java.io.FileOutputStream; -import java.io.FileReader; 
-import java.io.InputStream; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.Iterator; -import java.util.List; -import java.util.Arrays; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.benchmark.AbstractBenchmarker; @@ -33,6 +19,11 @@ import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.store.FSDirectory; + +import java.io.*; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.*; /** * Copyright 2005 The Apache Software Foundation * @@ -66,7 +57,7 @@ public class StandardBenchmarker extends AbstractBenchmarker implements Benchmar public static final String INDEX_DIR = "index"; //30-MAR-1987 14:22:36.87 - private static DateFormat format = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS"); + private static DateFormat format = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US); //DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.SHORT); static{ format.setLenient(true);