LUCENE-2416: Some improvements to Benchmark

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@938582 13f79535-47bb-0310-9956-ffa450edef68
2010-04-27 17:59:45 +00:00 · 2010-04-27 17:59:45 +00:00 · 8dccdd7679
parent af96066c27
commit 8dccdd7679
11 changed files with 145 additions and 79 deletions
--- a/lucene/contrib/benchmark/CHANGES.txt
+++ b/lucene/contrib/benchmark/CHANGES.txt
@ -2,6 +2,12 @@ Lucene Benchmark Contrib Change Log

 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.

+4/27/2010: WriteLineDocTask now supports multi-threading. Also, 
+  StringBufferReader was renamed to StringBuilderReader and works on 
+  StringBuilder now. In addition, LongToEnglishContentSource starts from 0
+  (instead of Long.MIN_VALUE+10) and wraps around to Long.MIN_VALUE (if you
+  ever hit Long.MAX_VALUE). (Shai Erera)
+
 4/07/2010
  LUCENE-2377: Enable the use of NoMergePolicy and NoMergeScheduler by 
  CreateIndexTask. (Shai Erera)
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/AbstractQueryMaker.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/AbstractQueryMaker.java
@ -17,7 +17,6 @@ package org.apache.lucene.benchmark.byTask.feeds;

 import org.apache.lucene.search.Query;
 import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.utils.Format;

 /**
 * Abstract base query maker. 
@ -45,7 +44,7 @@ public abstract class AbstractQueryMaker implements QueryMaker {
    StringBuffer sb = new StringBuffer();
    if (queries != null) {
      for (int i = 0; i < queries.length; i++) {
-        sb.append(i+". "+ Format.simpleName(queries[i].getClass())+" - "+queries[i].toString());
+        sb.append(i+". "+ queries[i].getClass().getSimpleName()+" - "+queries[i].toString());
        sb.append(newline);
      }
    }
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java
@ -40,8 +40,6 @@ import java.util.Stack;
 * denotes a relative path (<b>default=work</b>).
 * <li><b>docs.dir</b> - specifies the directory the Dir collection. Can be set
 * to a relative path if "work.dir" is also specified (<b>default=dir-out</b>).
- * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
- * parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
 * </ul>
 */
 public class DirContentSource extends ContentSource {
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
@ -333,7 +333,7 @@ public class DocMaker {
    String col = "                  ";
    StringBuffer sb = new StringBuffer();
    String newline = System.getProperty("line.separator");
-    sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline);
+    sb.append("------------> ").append(getClass().getSimpleName()).append(" statistics (").append(printNum).append("): ").append(newline);
    int nut = source.getTotalDocsCount();
    if (nut > lastPrintedNumUniqueTexts) {
      print = true;
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishContentSource.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishContentSource.java
@ -1,34 +1,55 @@
 package org.apache.lucene.benchmark.byTask.feeds;

+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 import org.apache.lucene.util.English;

 import java.io.IOException;
 import java.util.Date;

-
 /**
- *
- *
- **/
+ * Creates documents whose content is a <code>long</code> number starting from
+ * <code>0</code> and wrapping around to {@link Long#MIN_VALUE} after {@link Long#MAX_VALUE}.
+ */
 public class LongToEnglishContentSource extends ContentSource{
-  private long counter = Long.MIN_VALUE + 10;
+  private long counter = 0;

  @Override
  public void close() throws IOException {
-
  }
-  //TODO: reduce/clean up synchonization
+  
  @Override
  public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    docData.clear();
-    docData.setBody(English.longToEnglish(counter));
-    docData.setName("doc_" + String.valueOf(counter));
-    docData.setTitle("title_" + String.valueOf(counter));
-    docData.setDate(new Date());
+    // store the current counter to avoid synchronization later on
+    long curCounter;
+    synchronized (this) {
+      curCounter = counter;
      if (counter == Long.MAX_VALUE){
-      counter = Long.MIN_VALUE + 10;//loop around
+        counter = Long.MIN_VALUE;//loop around
+      } else {
+        ++counter;
      }
-    counter++;
+    }    
+    docData.setBody(English.longToEnglish(curCounter));
+    docData.setName("doc_" + String.valueOf(curCounter));
+    docData.setTitle("title_" + String.valueOf(curCounter));
+    docData.setDate(new Date());
    return docData;
  }

@ -36,4 +57,5 @@ public class LongToEnglishContentSource extends ContentSource{
  public void resetInputs() throws IOException {
    counter = Long.MIN_VALUE + 10;
  }
+  
 }
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
@ -32,7 +32,7 @@ import java.util.Locale;
 import java.util.zip.GZIPInputStream;

 import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.utils.StringBufferReader;
+import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
 import org.apache.lucene.util.ThreadInterruptedException;

 /**
@ -53,7 +53,6 @@ import org.apache.lucene.util.ThreadInterruptedException;
 * </ul>
 */
 public class TrecContentSource extends ContentSource {
-  // TODO (3.0): change StringBuffer to StringBuilder

  private static final class DateFormatInfo {
    DateFormat[] dfs;
@ -79,8 +78,8 @@ public class TrecContentSource extends ContentSource {
  };

  private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
-  private ThreadLocal<StringBufferReader> trecDocReader = new ThreadLocal<StringBufferReader>();
-  private ThreadLocal<StringBuffer> trecDocBuffer = new ThreadLocal<StringBuffer>();
+  private ThreadLocal<StringBuilderReader> trecDocReader = new ThreadLocal<StringBuilderReader>();
+  private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
  private File dataDir = null;
  private ArrayList<File> inputFiles = new ArrayList<File>();
  private int nextFile = 0;
@ -110,19 +109,19 @@ public class TrecContentSource extends ContentSource {
    return dfi;
  }

-  private StringBuffer getDocBuffer() {
-    StringBuffer sb = trecDocBuffer.get();
+  private StringBuilder getDocBuffer() {
+    StringBuilder sb = trecDocBuffer.get();
    if (sb == null) {
-      sb = new StringBuffer();
+      sb = new StringBuilder();
      trecDocBuffer.set(sb);
    }
    return sb;
  }
  
-  private Reader getTrecDocReader(StringBuffer docBuffer) {
-    StringBufferReader r = trecDocReader.get();
+  private Reader getTrecDocReader(StringBuilder docBuffer) {
+    StringBuilderReader r = trecDocReader.get();
    if (r == null) {
-      r = new StringBufferReader(docBuffer);
+      r = new StringBuilderReader(docBuffer);
      trecDocReader.set(r);
    } else {
      r.set(docBuffer);
@ -131,7 +130,7 @@ public class TrecContentSource extends ContentSource {
  }

  // read until finding a line that starts with the specified prefix, or a terminating tag has been found.
-  private void read(StringBuffer buf, String prefix, boolean collectMatchLine,
+  private void read(StringBuilder buf, String prefix, boolean collectMatchLine,
                    boolean collectAll, String terminatingTag)
      throws IOException, NoMoreDataException {
    String sep = "";
@ -248,7 +247,7 @@ public class TrecContentSource extends ContentSource {
        openNextFile();
      }

-      StringBuffer docBuf = getDocBuffer();
+      StringBuilder docBuf = getDocBuffer();
      
      // 1. skip until doc start
      docBuf.setLength(0);
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
@ -23,7 +23,6 @@ import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.stats.Points;
 import org.apache.lucene.benchmark.byTask.stats.TaskStats;
 import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.utils.Format;

 /**
 * An abstract task to be tested for performance. <br>
@ -67,7 +66,7 @@ public abstract class PerfTask implements Cloneable {

  /** Should not be used externally */
  private PerfTask() {
-    name = Format.simpleName(getClass());
+    name = getClass().getSimpleName();
    if (name.endsWith("Task")) {
      name = name.substring(0, name.length() - 4);
    }
@ -99,13 +98,7 @@ public abstract class PerfTask implements Cloneable {
    this.maxDepthLogStart = config.get("task.max.depth.log",0);

    String logStepAtt = "log.step";
-    // TODO (1.5): call getClass().getSimpleName() instead.
-    String taskName = getClass().getName();
-    int idx = taskName.lastIndexOf('.');
-    // To support test internal classes. when we move to getSimpleName, this can be removed.
-    int idx2 = taskName.indexOf('$', idx);
-    if (idx2 != -1) idx = idx2;
-    String taskLogStepAtt = "log.step." + taskName.substring(idx + 1, taskName.length() - 4 /* w/o the 'Task' part */);
+    String taskLogStepAtt = "log.step." + name;
    if (config.get(taskLogStepAtt, null) != null) {
      logStepAtt = taskLogStepAtt;
    }
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
@ -22,6 +22,7 @@ import java.io.BufferedWriter;
 import java.io.FileOutputStream;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

@ -56,8 +57,9 @@ public class WriteLineDocTask extends PerfTask {
  private static final Matcher NORMALIZER = Pattern.compile("[\t\r\n]+").matcher("");

  private int docSize = 0;
-  private BufferedWriter lineFileOut = null;
+  private PrintWriter lineFileOut = null;
  private DocMaker docMaker;
+  private ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
  
  public WriteLineDocTask(PerfRunData runData) throws Exception {
    super(runData);
@ -85,7 +87,7 @@ public class WriteLineDocTask extends PerfTask {
      out = new BufferedOutputStream(out, 1 << 16);
      out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out);
    }
-    lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16);
+    lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16));
    docMaker = runData.getDocMaker();
  }

@ -109,12 +111,15 @@ public class WriteLineDocTask extends PerfTask {
      f = doc.getField(DocMaker.DATE_FIELD);
      String date = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : "";
      
-      lineFileOut.write(title, 0, title.length());
-      lineFileOut.write(SEP);
-      lineFileOut.write(date, 0, date.length());
-      lineFileOut.write(SEP);
-      lineFileOut.write(body, 0, body.length());
-      lineFileOut.newLine();
+      StringBuilder sb = threadBuffer.get();
+      if (sb == null) {
+        sb = new StringBuilder();
+        threadBuffer.set(sb);
+      }
+      sb.setLength(0);
+      sb.append(title).append(SEP).append(date).append(SEP).append(body);
+      // lineFileOut is a PrintWriter, which synchronizes internally in println.
+      lineFileOut.println(sb.toString());
    }
    return 1;
  }
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Format.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Format.java
@ -107,19 +107,4 @@ public class Format {
    return res.substring(res.length() - col.length());
  }

-  /**
-   * Extract simple class name 
-   * @param cls class whose simple name is required 
-   * @return simple class name
-   */
-  public static String simpleName (Class<?> cls) {
-    String c = cls.getName();
-    String p = cls.getPackage().getName();
-    int k = c.lastIndexOf(p+".");
-    if (k<0) {
-      return c;
-    }
-    return c.substring(k+1+p.length());
-  }
-  
 }
--- a/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java
+++ b/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java
@ -21,22 +21,22 @@ import java.io.IOException;
 import java.io.Reader;

 /**
- * Implements a {@link Reader} over a {@link StringBuffer} instance. Although
+ * Implements a {@link Reader} over a {@link StringBuilder} instance. Although
 * one can use {@link java.io.StringReader} by passing it
- * {@link StringBuffer#toString()}, it is better to use this class, as it
- * doesn't mark the passed-in {@link StringBuffer} as shared (which will cause
+ * {@link StringBuilder#toString()}, it is better to use this class, as it
+ * doesn't mark the passed-in {@link StringBuilder} as shared (which will cause
 * inner char[] allocations at the next append() attempt).<br>
 * Notes:
 * <ul>
- * <li>This implementation assumes the underlying {@link StringBuffer} is not
+ * <li>This implementation assumes the underlying {@link StringBuilder} is not
 * changed during the use of this {@link Reader} implementation.
 * <li>This implementation is thread-safe.
 * <li>The implementation looks very much like {@link java.io.StringReader} (for
 * the right reasons).
 * <li>If one wants to reuse that instance, then the following needs to be done:
 * <pre>
- * StringBuffer sb = new StringBuffer("some text");
- * Reader reader = new StringBufferReader(sb);
+ * StringBuilder sb = new StringBuilder("some text");
+ * Reader reader = new StringBuilderReader(sb);
 * ... read from reader - don't close it ! ...
 * sb.setLength(0);
 * sb.append("some new text");
@ -45,23 +45,21 @@ import java.io.Reader;
 * </pre>
 * </ul>
 */
-public class StringBufferReader extends Reader {
+public class StringBuilderReader extends Reader {
  
-  // TODO (3.0): change to StringBuffer (including the name of the class)
-  
-  // The StringBuffer to read from.
-  private StringBuffer sb;
+  // The StringBuilder to read from.
+  private StringBuilder sb;

  // The length of 'sb'.
  private int length;

-  // The next position to read from the StringBuffer.
+  // The next position to read from the StringBuilder.
  private int next = 0;

  // The mark position. The default value 0 means the start of the text.
  private int mark = 0;

-  public StringBufferReader(StringBuffer sb) {
+  public StringBuilderReader(StringBuilder sb) {
    set(sb);
  }

@ -85,7 +83,7 @@ public class StringBufferReader extends Reader {
   * 
   * @param readAheadLimit Limit on the number of characters that may be read
   *        while still preserving the mark. Because the stream's input comes
-   *        from a StringBuffer, there is no actual limit, so this argument 
+   *        from a StringBuilder, there is no actual limit, so this argument 
   *        must not be negative, but is otherwise ignored.
   * @exception IllegalArgumentException If readAheadLimit is < 0
   * @exception IOException If an I/O error occurs
@ -156,7 +154,7 @@ public class StringBufferReader extends Reader {
    }
  }

-  public void set(StringBuffer sb) {
+  public void set(StringBuilder sb) {
    synchronized (lock) {
      this.sb = sb;
      length = sb.length();
--- a/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
+++ b/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
@ -22,7 +22,9 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.util.HashSet;
 import java.util.Properties;
+import java.util.Set;

 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.lucene.benchmark.BenchmarkTestCase;
@ -97,6 +99,21 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    }
  }
  
+  // class has to be public so that Class.forName.newInstance() will work
+  public static final class ThreadingDocMaker extends DocMaker {
+  
+    @Override
+    public Document makeDocument() throws Exception {
+      Document doc = new Document();
+      String name = Thread.currentThread().getName();
+      doc.add(new Field(BODY_FIELD, "body_" + name, Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      doc.add(new Field(TITLE_FIELD, "title_" + name, Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      doc.add(new Field(DATE_FIELD, "date_" + name, Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      return doc;
+    }
+    
+  }
+
  private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();

  private PerfRunData createPerfRunData(File file, boolean setBZCompress,
@ -226,4 +243,48 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    }
  }

+  public void testMultiThreaded() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    PerfRunData runData = createPerfRunData(file, false, null, ThreadingDocMaker.class.getName());
+    final WriteLineDocTask wldt = new WriteLineDocTask(runData);
+    Thread[] threads = new Thread[10];
+    for (int i = 0; i < threads.length; i++) {
+      threads[i] = new Thread("t" + i) {
+        @Override
+        public void run() {
+          try {
+            wldt.doLogic();
+          } catch (Exception e) {
+            throw new RuntimeException(e);
+          }
+        }
+      };
+    }
+    
+    for (Thread t : threads) t.start();
+    for (Thread t : threads) t.join();
+    
+    wldt.close();
+    
+    Set<String> ids = new HashSet<String>();
+    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
+    try {
+      for (int i = 0; i < threads.length; i++) {
+        String line = br.readLine();
+        String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
+        assertEquals(3, parts.length);
+        // check that all thread names written are the same in the same line
+        String tname = parts[0].substring(parts[0].indexOf('_'));
+        ids.add(tname);
+        assertEquals(tname, parts[1].substring(parts[1].indexOf('_')));
+        assertEquals(tname, parts[2].substring(parts[2].indexOf('_')));
+      }
+      // only threads.length lines should exist
+      assertNull(br.readLine());
+      assertEquals(threads.length, ids.size());
+    } finally {
+      br.close();
+    }
+  }
+
 }