Applied 788 and 790 from Doron Cohen. Ran both the micro-standard and the task runs and results look reasonable.

Thanks, Doron git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@506093 13f79535-47bb-0310-9956-ffa450edef68
2007-02-11 18:59:22 +00:00 · 2007-02-11 18:59:22 +00:00 · bb66099414
parent 881ab33b81
commit bb66099414
9 changed files with 75 additions and 259 deletions
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@ -8,4 +8,7 @@ $Id:$

 1. Committed Doron Cohen's benchmarking contribution, which provides an easily expandable task based approach to benchmarking.  See the javadocs for information. (Doron Cohen via Grant Ingersoll)

-2. Added this file.
+2. Added this file.
+
+3. 2/11/07: LUCENE-790 and 788:  Fixed Locale issue with date formatter. Fixed some minor issues with benchmarking by task.  Added a dependency
+ on the Lucene demo to the build classpath.  (Doron Cohen, Grant Ingersoll)
--- a/contrib/benchmark/build.xml
+++ b/contrib/benchmark/build.xml
@ -8,6 +8,7 @@
    <import file="../contrib-build.xml"/>
    <property name="working.dir" value="work"/>

+
    <target name="check-files">

        <available file="temp/news20.tar.gz" property="news20.exists"/>
@ -84,8 +85,11 @@
    <property name="collections.jar" value="commons-collections-3.1.jar"/>
    <property name="logging.jar" value="commons-logging-1.0.4.jar"/>
    <property name="bean-utils.jar" value="commons-beanutils-1.7.0.jar"/>
+    <property name="lucene-demos.jar" location="${common.dir}/build/lucene-demos-${version}.jar"/>
+
    <path id="classpath">
        <pathelement path="${lucene.jar}"/>
+        <pathelement path="${lucene-demos.jar}"/>
        <pathelement path="${basedir}/lib/${digester.jar}"/>
        <pathelement path="${basedir}/lib/${collections.jar}"/>
        <pathelement path="${basedir}/lib/${logging.jar}"/>
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
@ -27,8 +27,17 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
 * Run the benchmark algorithm.
 * <p>Usage: java Benchmark  algorithm-file
 * <ol>
- * <li>Read algorithm.
- * <li> Run the algorithm.
+ * <li>Read algorithm.</li>
+ * <li> Run the algorithm.</li>
+ * </ol>
+ * Things to be added/fixed in "Benchmarking by tasks":
+ * <ol>
+ * <li>TODO - report into Excel and/or graphed view.</li>
+ * <li>TODO - perf comparison between Lucene releases over the years.</li>
+ * <li>TODO - perf report adequate to include in Lucene nightly build site? (so we can easily track performance changes.)</li>
+ * <li>TODO - add overall time control for repeated execution (vs. current by-count only).</li>
+ * <li>TODO - query maker that is based on index statistics.</li>
+ * <li>TODO - prpoerties documentation - each task should document the properties it relies on.</li> 
 * </ol>
 */
 public class Benchmark {
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
@ -23,24 +23,21 @@ import java.io.FileReader;
 import java.text.DateFormat;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
-import java.util.Date;
-import org.apache.lucene.document.DateTools;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
+import java.util.Locale;
+
 import org.apache.lucene.benchmark.byTask.utils.Config;


 /**
 * A DocMaker using the Reuters collection for its input.
 */
-public class ReutersDocMaker extends SimpleDocMaker {
+public class ReutersDocMaker extends BasicDocMaker {

  private DateFormat dateFormat;
  private File dataDir = null;
-  private ArrayList txtFiles = new ArrayList();
+  private ArrayList inputFiles = new ArrayList();
  private int nextFile = 0;
-  private int round=0;
-  private int count = 0;
+  private int iteration=0;
  
  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
@ -49,48 +46,28 @@ public class ReutersDocMaker extends SimpleDocMaker {
    super.setConfig(config);
    String d = config.get("docs.dir","reuters-out");
    dataDir = new File(new File("work"),d);
-    addFiles(dataDir);
-    if (txtFiles.size()==0) {
+    collectFiles(dataDir,inputFiles);
+    if (inputFiles.size()==0) {
      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
    }
    // date format: 30-MAR-1987 14:22:36.87
-    dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS");
+    dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
    dateFormat.setLenient(true);
  }

-  private void addFiles(File f) {
-    if (!f.canRead()) {
-      return;
-    }
-    if (f.isDirectory()) {
-      File files[] = f.listFiles();
-      for (int i = 0; i < files.length; i++) {
-        addFiles(files[i]);
-      }
-      return;
-    }
-    txtFiles.add(f);
-    addUniqueBytes(f.length());
-  }
-
-  /* (non-Javadoc)
-   * @see SimpleDocMaker#makeDocument()
-   */
-  public Document makeDocument() throws Exception {
+  protected DocData getNextDocData() throws Exception {
    File f = null;
    String name = null;
    synchronized (this) {
-      f = (File) txtFiles.get(nextFile++);
-      name = f.getCanonicalPath()+"_"+round;
-      if (nextFile >= txtFiles.size()) { 
+      f = (File) inputFiles.get(nextFile++);
+      name = f.getCanonicalPath()+"_"+iteration;
+      if (nextFile >= inputFiles.size()) { 
        // exhausted files, start a new round
        nextFile = 0;
-        round++;
+        iteration++;
      }
    }
    
-    Document doc = new Document();
-    doc.add(new Field("name",name,storeVal,indexVal,termVecVal));
    BufferedReader reader = new BufferedReader(new FileReader(f));
    String line = null;
    //First line is the date, 3rd is the title, rest is body
@ -98,27 +75,23 @@ public class ReutersDocMaker extends SimpleDocMaker {
    reader.readLine();//skip an empty line
    String title = reader.readLine();
    reader.readLine();//skip an empty line
-    StringBuffer body = new StringBuffer(1024);
+    StringBuffer bodyBuf = new StringBuffer(1024);
    while ((line = reader.readLine()) != null) {
-      body.append(line).append(' ');
+      bodyBuf.append(line).append(' ');
    }
-    Date date = dateFormat.parse(dateStr.trim());
-    doc.add(new Field("date", DateTools.dateToString(date, DateTools.Resolution.SECOND), 
-        Field.Store.YES, Field.Index.UN_TOKENIZED));
-
-    if (title != null) {
-      doc.add(new Field("title", title, storeVal,indexVal,termVecVal));
-    }
-    if (body.length() > 0) {
-        doc.add(new Field("body", body.toString(), storeVal,indexVal,termVecVal));
-    }
-
-    count++;
+    
    addBytes(f.length());

-    return doc;
+    DocData dd = new DocData();
+    
+    dd.date = dateFormat.parse(dateStr.trim());
+    dd.name = name;
+    dd.title = title;
+    dd.body = bodyBuf.toString();
+    return dd;
  }

+
  /*
   *  (non-Javadoc)
   * @see DocMaker#resetIinputs()
@ -126,8 +99,7 @@ public class ReutersDocMaker extends SimpleDocMaker {
  public synchronized void resetInputs() {
    super.resetInputs();
    nextFile = 0;
-    round = 0;
-    count = 0;
+    iteration = 0;
  }

  /*
@ -135,22 +107,7 @@ public class ReutersDocMaker extends SimpleDocMaker {
   * @see DocMaker#numUniqueTexts()
   */
  public int numUniqueTexts() {
-    return txtFiles.size();
+    return inputFiles.size();
  }

-  /*
-   *  (non-Javadoc)
-   * @see DocMaker#getCount()
-   */
-  public int getCount() {
-    return count;
-  }
-
-  /*
-   *  (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument(int)
-   */
-  public Document makeDocument(int size) throws Exception {
-    throw new Exception(this+".makeDocument (int size) is not supported!");
-  }
 }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java
@ -17,29 +17,13 @@ package org.apache.lucene.benchmark.byTask.feeds;
 * limitations under the License.
 */

-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.utils.Format;
-
-
 /**
 * Create documents for the test
 */
-public class SimpleDocMaker implements DocMaker {
+public class SimpleDocMaker extends BasicDocMaker {
  
-  static final String BODY_FIELD = "body";
  private int docID = 0;
-  private long numBytes = 0;
-  private long numUniqueBytes = 0;

-  protected Config config;
-  private int nextDocTextPosition = 0; // for creating docs of fixed size.
-
-  protected Field.Store storeVal = Field.Store.NO;
-  protected Field.Index indexVal = Field.Index.TOKENIZED;
-  protected Field.TermVector termVecVal = Field.TermVector.NO;
-  
  static final String DOC_TEXT = // from a public first aid info at http://firstaid.ie.eu.org 
    "Well it may be a little dramatic but sometimes it true. " +
    "If you call the emergency medical services to an incident, " +
@ -52,100 +36,18 @@ public class SimpleDocMaker implements DocMaker {
    "ones and the stranger whose life may depend on you being in the " +
    "right place at the right time with the right knowledge.";
  
-  private static int DOC_TEXT_LENGTH = DOC_TEXT.length(); 
-
-  /*
-   *  (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument()
-   */
-  public Document makeDocument () throws Exception {
-    return makeDocument(0);
-  }
-
-  /*
-   *  (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument(int)
-   */
-  public Document makeDocument(int size) throws Exception {
-    int docid = newdocid();
-    Document doc = new Document();
-    doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
-    String docText = createDocText(size);
-    doc.add(new Field(BODY_FIELD, "synthetic body text"+docid+" "+docText, storeVal, indexVal, termVecVal));
-    addBytes(docText.length()); // should multiply by 2 here?
-    return doc;
-  }
-
-  private synchronized int[] nextDocText(int fixedDocSize) {
-    int from = nextDocTextPosition;
-    int to = nextDocTextPosition;
-    int wraps = 0;
-    int size = 0;
-    
-    while (size<fixedDocSize) {
-      int added = DOC_TEXT_LENGTH - to;
-      if (size+added <= fixedDocSize) {
-        to = 0;
-        size += added;
-        wraps ++;
-      } else {
-        added = fixedDocSize - size;
-        size += added;
-        to += added;
-      }
-    }
-    
-    nextDocTextPosition = to;
-    
-    return new int[]{from,to,wraps};
-  }
-  
-  private String createDocText(int fixedDocSize) {
-    if (fixedDocSize<=0) { 
-      //no fixed doc size requirement
-      return DOC_TEXT;
-    } 
-      
-    // create a document wit fixed doc size
-    int fromToWraps[] = nextDocText(fixedDocSize);
-    int from = fromToWraps[0];
-    int to = fromToWraps[1];
-    int wraps = fromToWraps[2];
-    StringBuffer sb = new StringBuffer();
-    while (wraps-- > 0) {
-      sb.append(DOC_TEXT.substring(from));
-      from = 0;
-    }
-    sb.append(DOC_TEXT.substring(from,to));
-    return sb.toString();
-  }
-
  // return a new docid
  private synchronized int newdocid() {
    return docID++;
  }

-  /* (non-Javadoc)
-   * @see DocMaker#setConfig(java.util.Properties)
-   */
-  public void setConfig(Config config) {
-    this.config = config;
-    boolean stored = config.get("doc.stored",false); 
-    boolean tokenized = config.get("doc.tokenized",true);
-    boolean termVec = config.get("doc.term.vector",false);
-    storeVal = (stored ? Field.Store.YES : Field.Store.NO);
-    indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
-    termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO);
-  }
-
  /*
   *  (non-Javadoc)
   * @see DocMaker#resetIinputs()
   */
  public synchronized void resetInputs() {
-    printDocStatistics();
+    super.resetInputs();
    docID = 0;
-    numBytes = 0;
  }

  /*
@ -156,72 +58,12 @@ public class SimpleDocMaker implements DocMaker {
    return 0; // not applicable
  }

-  /*
-   *  (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#numUniqueBytes()
-   */
-  public long numUniqueBytes() {
-    return numUniqueBytes;
-  }
-
-  /*
-   *  (non-Javadoc)
-   * @see DocMaker#getCount()
-   */
-  public int getCount() {
-    return docID;
-  }
-
-  /*
-   *  (non-Javadoc)
-   * @see DocMaker#getByteCount()
-   */
-  public long getByteCount() {
-    return numBytes;
-  }
-
-  protected void addUniqueBytes (long n) {
-    numUniqueBytes += n;
-  }
-  
-  protected void addBytes (long n) {
-    numBytes += n;
-  }
-
-  /*
-   *  (non-Javadoc)
-   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#printDocStatistics()
-   */
-  private int lastPrintedNumUniqueTexts = 0;
-  private long lastPrintedNumUniqueBytes = 0;
-  private int printNum = 0;
-  public void printDocStatistics() {
-    boolean print = false;
-    String col = "                  ";
-    StringBuffer sb = new StringBuffer();
-    String newline = System.getProperty("line.separator");
-    sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline);
-    int nut = numUniqueTexts();
-    if (nut > lastPrintedNumUniqueTexts) {
-      print = true;
-      sb.append("total bytes of unique texts: ").append(Format.format(0,nut,col)).append(newline);
-      lastPrintedNumUniqueTexts = nut;
-    }
-    long nub = numUniqueBytes();
-    if (nub > lastPrintedNumUniqueBytes) {
-      print = true;
-      sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline);
-      lastPrintedNumUniqueBytes = nub;
-    }
-    if (getCount()>0) {
-      print = true;
-      sb.append("num files added since last inputs reset:   ").append(Format.format(0,getCount(),col)).append(newline);
-      sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getByteCount(),col)).append(newline);
-    }
-    if (print) {
-      System.out.println(sb.append(newline).toString());
-      printNum++;
-    }
+  protected DocData getNextDocData() {
+    DocData dd = new DocData();
+    dd.body = DOC_TEXT;
+    dd.name = "doc"+newdocid();
+    addBytes(DOC_TEXT.length());
+    return dd;
  }

 }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
@ -44,7 +44,7 @@ Contained packages:
 </tr>
 <tr>
   <td><a href="feeds/package-summary.html">feeds</a></td>
-   <td>Sources foe benchmark inputs: documents and queries.</td>
+   <td>Sources for benchmark inputs: documents and queries.</td>
 </tr>
 <tr>
   <td><a href="utils/package-summary.html">utils</a></td>
@ -92,7 +92,7 @@ Easiest way to run a benchmarks is using the predefined ant task:
     <br>- would run the <code>compound-penalty.alg</code> "algorithm".
 </li>
 <li>ant run-task -Dtask.alg=[full-path-to-your-alg-file]
-     <br>- would run the <code>your perf test</code> "algorithm".
+     <br>- would run <code>your perf test</code> "algorithm".
 </li>
 <li>java org.apache.lucene.benchmark.byTask.programmatic.Sample
     <br>- would run a performance test programmatically - without using an alg file.
@ -109,7 +109,7 @@ otherwise, you can extend the framework to meet your needs, as explained herein.
 <p>
 Each benchmark run has a DocMaker and a QueryMaker. These two should usually match, so
 that "meaningful" queries are used for a certain collection.
-Properties defined at the header of the alg file define which "makers" should be used.
+Properties set at the header of the alg file define which "makers" should be used.
 You can also specify your own makers, implementing the DocMaker and QureyMaker interfaces.
 </p>

@ -275,8 +275,8 @@ regular index/search work tasks, report tasks, and control tasks.
     <br>This increments a global "round counter". All task runs that would start now would
     record the new, updated round counter as their round number. This would appear in reports.
     In particular, see <font color="#FF0066">RepSumByNameRound</font> above.
-     <br>An additional effect of NewRound, is that numeric and boolean properties defined in the
-     .properties file as a sequence of values, e.g. <font color="#FF0066">merge.factor=mrg:10:100:10:100</font> would
+     <br>An additional effect of NewRound, is that numeric and boolean properties defined (at the head 
+     of the .alg file) as a sequence of values, e.g. <font color="#FF0066">merge.factor=mrg:10:100:10:100</font> would
     increment (cyclic) to the next value.
     Note: this would also be reflected in the reports, in this case under a column that would be named "mrg".
     </li>
@ -368,7 +368,7 @@ Some of the currently defined properties are:
    (Make sure it is no shorter than any value in the sequence).
    <ul>
        <li><font color="#FF0066">max.buffered</font>
-        <br>Example: buffered=buf.10.10.100.100 -
+        <br>Example: max.buffered=buf:10:10:100:100 -
        this would define using maxBufferedDocs of 10 in iterations 0 and 1,
        and 100 in iterations 2 and 3.
        </li>
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java
@ -51,6 +51,10 @@ public class Sample {
    // task to report
    RepSumByNameTask rep = new RepSumByNameTask(runData);
    top.addTask(rep);
+
+    // print algorithm
+    System.out.println(top.toString());
+    
    // execute
    top.doLogic();
  }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
@ -210,6 +210,12 @@ public class TaskSequence extends PerfTask {
   */
  public void setNoChildReport() {
    letChildReport  = false;
+    for (Iterator it = tasks.iterator(); it.hasNext();) {
+      PerfTask task = (PerfTask) it.next();
+      if (task instanceof TaskSequence) {
+        ((TaskSequence)task).setNoChildReport();
+  }
+    }
  }

  /**
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardBenchmarker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/standard/StandardBenchmarker.java
@ -1,19 +1,5 @@
 package org.apache.lucene.benchmark.standard;

-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileFilter;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.InputStream;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Arrays;
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.benchmark.AbstractBenchmarker;
@ -33,6 +19,11 @@ import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.FSDirectory;
+
+import java.io.*;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.*;
 /**
 * Copyright 2005 The Apache Software Foundation
 *
@ -66,7 +57,7 @@ public class StandardBenchmarker extends AbstractBenchmarker implements Benchmar

    public static final String INDEX_DIR = "index";
    //30-MAR-1987 14:22:36.87
-    private static DateFormat format = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS");
+    private static DateFormat format = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
    //DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.SHORT);
    static{
        format.setLenient(true);