LUCENE-947: add creation of & indexing from 'one document per line' text files to minimize IO overhead of creating documents when running tests

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@559366 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2007-07-25 08:54:58 +00:00
parent 2d16613438
commit 02dd452026
17 changed files with 553 additions and 26 deletions

View File: contrib/benchmark/CHANGES.txt

@@ -4,6 +4,11 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
 $Id:$

+7/24/07
+
+LUCENE-947: Add support for creating, and indexing from, a "one
+document per line" text file, which reduces the per-document overhead
+of opening a single file for each document.
+
 6/30/07

 LUCENE-848: Added support for Wikipedia benchmarking.

View File: conf/createLineFile.alg (new file)

@@ -0,0 +1,43 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
#
# This alg will process the Reuters document feed to produce a
# single file that contains all documents, one per line.
#
# To use this, first cd to contrib/benchmark and then run:
#
# ant run-task -Dtask.alg=conf/createLineFile.alg
#
# Then, to index the documents in the line file, see
# indexLineFile.alg.
#
# Where to get documents from:
doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
# Where to write the line file output:
line.file.out=work/reuters.lines.txt
# Stop after processing the document feed once:
doc.maker.forever=false
# -------------------------------------------------------------------------------------
# Process all documents, appending each one to the line file:
{WriteLineDoc()}: *
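
# For reference, each output line holds one document as
# title <TAB> date <TAB> body (the format WriteLineDocTask writes and
# LineDocMaker parses back). A hypothetical line, with illustrative
# values and <TAB> standing in for a real tab character:
#
#   COCOA EXPORTS SEEN RISING<TAB>19870226150101<TAB>Exporters said cocoa shipments ...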

View File: conf/indexLineFile.alg (new file)

@@ -0,0 +1,53 @@
#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements. See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License. You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
#
# This file indexes documents contained in a single text file, one per
# line.  See createLineFile.alg for how to create this file.  The
# benefit is that it removes the IO cost of opening one file per
# document, letting you more accurately measure the time spent
# analyzing and indexing your documents versus the time spent creating
# them.
#
# To use this, first run createLineFile.alg, then cd to
# contrib/benchmark and run:
#
# ant run-task -Dtask.alg=conf/indexLineFile.alg
#
analyzer=org.apache.lucene.analysis.SimpleAnalyzer
# Feed that knows how to process the line file format:
doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
# File that contains one document per line:
docs.file=work/reuters.lines.txt
# Process documents only once:
doc.maker.forever=false
# -------------------------------------------------------------------------------------
# Reset the system, create a new index, index all docs from the line
# file, close the index, produce a report.
ResetSystemErase
CreateIndex
{AddDoc}: *
CloseIndex
RepSumByPref AddDoc
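
Putting the two configurations together, the full workflow from contrib/benchmark is:

  cd contrib/benchmark
  # 1. flatten the Reuters feed into a single line file (work/reuters.lines.txt)
  ant run-task -Dtask.alg=conf/createLineFile.alg
  # 2. index the documents from that line file
  ant run-task -Dtask.alg=conf/indexLineFile.alg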

View File: org/apache/lucene/benchmark/byTask/PerfRunData.java

@@ -70,6 +70,7 @@ public class PerfRunData {
   private IndexReader indexReader;
   private IndexWriter indexWriter;
   private Config config;
+  private long startTimeMillis;

   // constructor
   public PerfRunData (Config config) throws Exception {

@@ -136,6 +137,15 @@ public class PerfRunData {
     // release unused stuff
     System.runFinalization();
     System.gc();
+
+    startTimeMillis = System.currentTimeMillis();
+  }
+
+  /**
+   * @return Start time in milliseconds
+   */
+  public long getStartTimeMillis() {
+    return startTimeMillis;
   }

   /**

View File: org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java

@@ -39,6 +39,8 @@ import java.util.Iterator;
  * doc.stored=true|FALSE<br/>
  * doc.tokenized=TRUE|false<br/>
  * doc.term.vector=true|FALSE<br/>
+ * doc.term.vector.positions=true|FALSE<br/>
+ * doc.term.vector.offsets=true|FALSE<br/>
  * doc.store.body.bytes=true|FALSE //Store the body contents raw UTF-8 bytes as a field<br/>
  */
 public abstract class BasicDocMaker implements DocMaker {

@@ -55,7 +57,13 @@ public abstract class BasicDocMaker implements DocMaker {
   // leftovers are thread local, because it is unsafe to share residues between threads
   private ThreadLocal leftovr = new ThreadLocal();

-  static final String BODY_FIELD = "body";
+  public static final String BODY_FIELD = "body";
+  public static final String TITLE_FIELD = "doctitle";
+  public static final String DATE_FIELD = "docdate";
+  public static final String ID_FIELD = "docid";
+  public static final String BYTES_FIELD = "bytes";
+  public static final String NAME_FIELD = "docname";

   private long numBytes = 0;
   private long numUniqueBytes = 0;

@@ -97,17 +105,17 @@ public abstract class BasicDocMaker implements DocMaker {
   private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
     int docid = incrNumDocsCreated();
     Document doc = new Document();
-    doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
+    doc.add(new Field(ID_FIELD, "doc"+docid, storeVal, indexVal, termVecVal));
     if (docData.getName()!=null) {
       String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
-      doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
+      doc.add(new Field(NAME_FIELD, name, storeVal, indexVal, termVecVal));
     }
     if (docData.getDate()!=null) {
       String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
-      doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
+      doc.add(new Field(DATE_FIELD, dateStr, storeVal, indexVal, termVecVal));
     }
     if (docData.getTitle()!=null) {
-      doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal));
+      doc.add(new Field(TITLE_FIELD, docData.getTitle(), storeVal, indexVal, termVecVal));
     }
     if (docData.getBody()!=null && docData.getBody().length()>0) {
       String bdy;

@@ -127,7 +135,7 @@ public abstract class BasicDocMaker implements DocMaker {
       }
       doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
       if (storeBytes == true) {
-        doc.add(new Field("bytes", bdy.getBytes("UTF-8"), Field.Store.YES));
+        doc.add(new Field(BYTES_FIELD, bdy.getBytes("UTF-8"), Field.Store.YES));
       }
     }

@@ -188,7 +196,18 @@ public abstract class BasicDocMaker implements DocMaker {
     boolean termVec = config.get("doc.term.vector",false);
     storeVal = (stored ? Field.Store.YES : Field.Store.NO);
     indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
-    termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO);
+    boolean termVecPositions = config.get("doc.term.vector.positions",false);
+    boolean termVecOffsets = config.get("doc.term.vector.offsets",false);
+    if (termVecPositions && termVecOffsets)
+      termVecVal = Field.TermVector.WITH_POSITIONS_OFFSETS;
+    else if (termVecPositions)
+      termVecVal = Field.TermVector.WITH_POSITIONS;
+    else if (termVecOffsets)
+      termVecVal = Field.TermVector.WITH_OFFSETS;
+    else if (termVec)
+      termVecVal = Field.TermVector.YES;
+    else
+      termVecVal = Field.TermVector.NO;
     storeBytes = config.get("doc.store.body.bytes", false);
     forever = config.get("doc.maker.forever",true);
   }
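
The new term-vector properties compose rather than replace each other: setting both doc.term.vector.positions and doc.term.vector.offsets selects TermVector.WITH_POSITIONS_OFFSETS, either one alone selects WITH_POSITIONS or WITH_OFFSETS (term vectors are implied), and doc.term.vector=true by itself still selects YES. An illustrative .alg fragment requesting both:

  # illustrative: index term vectors with positions and offsets
  doc.term.vector.positions=true
  doc.term.vector.offsets=true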

View File: DirDocMaker.java

@@ -40,7 +40,7 @@ import java.util.Stack;
  */
 public class DirDocMaker extends BasicDocMaker {

-  private DateFormat dateFormat;
+  private ThreadLocal dateFormat = new ThreadLocal();
   private File dataDir = null;
   private int iteration=0;

@@ -148,11 +148,21 @@ public class DirDocMaker extends BasicDocMaker {
     if (inputFiles==null) {
       throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
     }
-    // date format: 30-MAR-1987 14:22:36
-    dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss",Locale.US);
-    dateFormat.setLenient(true);
   }

+  // get/initiate a thread-local simple date format (must do so
+  // because SimpleDateFormat is not thread-safe).
+  protected DateFormat getDateFormat () {
+    DateFormat df = (DateFormat) dateFormat.get();
+    if (df == null) {
+      // date format: 30-MAR-1987 14:22:36.87
+      df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
+      df.setLenient(true);
+      dateFormat.set(df);
+    }
+    return df;
+  }
+
   protected DocData getNextDocData() throws Exception {
     File f = null;
     String name = null;

@@ -184,7 +194,7 @@ public class DirDocMaker extends BasicDocMaker {
     reader.close();
     addBytes(f.length());

-    Date date = dateFormat.parse(dateStr.trim());
+    Date date = getDateFormat().parse(dateStr.trim());
     return new DocData(name, bodyBuf.toString(), title, null, date);
   }

View File: FileBasedQueryMaker.java

@@ -46,7 +46,7 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMaker {
     Analyzer anlzr = (Analyzer) Class.forName(config.get("analyzer",
         "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance();
-    String defaultField = config.get("file.query.maker.default.field", "body");
+    String defaultField = config.get("file.query.maker.default.field", BasicDocMaker.BODY_FIELD);
     QueryParser qp = new QueryParser(defaultField, anlzr);
     List qq = new ArrayList();

View File: org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (new file)

@@ -0,0 +1,159 @@
package org.apache.lucene.benchmark.byTask.feeds;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.FileReader;

/**
 * A DocMaker reading one line at a time as a Document from a single
 * file.  This saves the IO cost (over DirDocMaker) of recursing
 * through a directory and opening a new file for every document.  It
 * also re-uses its Document and Field instances to improve indexing
 * speed.
 *
 * Config properties:
 * docs.file=&lt;path to the file&gt;
 */
public class LineDocMaker extends BasicDocMaker {

  private BufferedReader fileIn;
  private ThreadLocal docState = new ThreadLocal();
  private String fileName;

  private static int READER_BUFFER_BYTES = 64*1024;

  private class DocState {
    Document doc;
    Field bodyField;
    Field titleField;
    Field dateField;

    public DocState() {
      bodyField = new Field(BasicDocMaker.BODY_FIELD,
                            "",
                            storeVal,
                            Field.Index.TOKENIZED,
                            termVecVal);
      titleField = new Field(BasicDocMaker.TITLE_FIELD,
                             "",
                             storeVal,
                             Field.Index.TOKENIZED,
                             termVecVal);
      dateField = new Field(BasicDocMaker.DATE_FIELD,
                            "",
                            storeVal,
                            Field.Index.TOKENIZED,
                            termVecVal);
      doc = new Document();
      doc.add(bodyField);
      doc.add(titleField);
      doc.add(dateField);
    }

    final static String SEP = WriteLineDocTask.SEP;

    public Document setFields(String line) {
      // title <TAB> date <TAB> body <NEWLINE>
      int spot = line.indexOf(SEP);
      titleField.setValue(line.substring(0, spot));
      int spot2 = line.indexOf(SEP, 1+spot);
      dateField.setValue(line.substring(1+spot, spot2));
      bodyField.setValue(line.substring(1+spot2, line.length()));
      return doc;
    }
  }

  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
  public void setConfig(Config config) {
    super.setConfig(config);
    resetInputs();
  }

  protected DocData getNextDocData() throws Exception {
    throw new RuntimeException("not implemented");
  }

  private DocState getDocState() {
    DocState ds = (DocState) docState.get();
    if (ds == null) {
      ds = new DocState();
      docState.set(ds);
    }
    return ds;
  }

  public Document makeDocument() throws Exception {
    String line;
    synchronized(this) {
      while(true) {
        line = fileIn.readLine();
        if (line == null) {
          if (!forever)
            throw new NoMoreDataException();
          else {
            // Reset the file
            openFile();
          }
        } else {
          break;
        }
      }
    }

    return getDocState().setFields(line);
  }

  public Document makeDocument(int size) throws Exception {
    throw new RuntimeException("cannot change document size with LineDocMaker; please use DirDocMaker instead");
  }

  public synchronized void resetInputs() {
    super.resetInputs();
    fileName = config.get("docs.file", null);
    if (fileName == null)
      throw new RuntimeException("docs.file must be set");
    openFile();
  }

  private void openFile() {
    try {
      if (fileIn != null)
        fileIn.close();
      fileIn = new BufferedReader(new FileReader(fileName), READER_BUFFER_BYTES);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  public int numUniqueTexts() {
    return -1;
  }
}
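
To make the setFields() parsing concrete, here is a minimal standalone sketch of the same tab-splitting on a hypothetical line (the tab-separated layout comes from WriteLineDocTask; the data values are made up):

  // title <TAB> date <TAB> body, as written by WriteLineDocTask
  String line = "COCOA EXPORTS SEEN RISING\t19870226150101\tExporters said cocoa shipments ...";
  int spot = line.indexOf('\t');            // end of title
  int spot2 = line.indexOf('\t', 1+spot);   // end of date
  String title = line.substring(0, spot);        // "COCOA EXPORTS SEEN RISING"
  String date = line.substring(1+spot, spot2);   // "19870226150101"
  String body = line.substring(1+spot2);         // everything after the second tab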

View File: ReutersQueryMaker.java

@@ -71,7 +71,7 @@ public class ReutersQueryMaker extends AbstractQueryMaker implements QueryMaker {
    * @return array of Lucene queries
    */
   private static Query[] createQueries(List qs, Analyzer a) {
-    QueryParser qp = new QueryParser("body", a);
+    QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD, a);
     List queries = new ArrayList();
     for (int i = 0; i < qs.size(); i++) {
       try {

@@ -107,7 +107,7 @@ public class ReutersQueryMaker extends AbstractQueryMaker implements QueryMaker {
     List queryList = new ArrayList(20);
     queryList.addAll(Arrays.asList(STANDARD_QUERIES));
-    queryList.addAll(Arrays.asList(getPrebuiltQueries("body")));
+    queryList.addAll(Arrays.asList(getPrebuiltQueries(BasicDocMaker.BODY_FIELD)));
     return createQueries(queryList, anlzr);
   }

View File: SimpleQueryMaker.java

@@ -45,11 +45,11 @@ public class SimpleQueryMaker extends AbstractQueryMaker implements QueryMaker {
     Analyzer anlzr= (Analyzer) Class.forName(config.get("analyzer",
         "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance();

-    QueryParser qp = new QueryParser("body",anlzr);
+    QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD,anlzr);
     ArrayList qq = new ArrayList();
-    Query q1 = new TermQuery(new Term("docid","doc2"));
+    Query q1 = new TermQuery(new Term(BasicDocMaker.ID_FIELD,"doc2"));
     qq.add(q1);
-    Query q2 = new TermQuery(new Term("body","simple"));
+    Query q2 = new TermQuery(new Term(BasicDocMaker.BODY_FIELD,"simple"));
     qq.add(q2);
     BooleanQuery bq = new BooleanQuery();
     bq.add(q1,Occur.MUST);

View File (benchmark package HTML documentation)

@@ -519,6 +519,8 @@ Here is a list of currently defined properties:
 </li><li>doc.stored
 </li><li>doc.tokenized
 </li><li>doc.term.vector
+</li><li>doc.term.vector.positions
+</li><li>doc.term.vector.offsets
 </li><li>doc.store.body.bytes
 </li><li>docs.dir
 </li><li>query.maker

@@ -540,6 +542,8 @@ Here is a list of currently defined properties:
 </li><li>merge.factor
 </li><li>max.buffered
 </li><li>directory
+</li><li>ram.flush.mb
+</li><li>autocommit
 </li></ul>
 </li>

View File: org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java

@@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.document.Document;
+import java.text.NumberFormat;

 /**

@@ -81,7 +82,10 @@ public class AddDocTask extends PerfTask {
       logStep = getRunData().getConfig().get("doc.add.log.step",DEFAULT_ADD_DOC_LOG_STEP);
     }
     if (logStep>0 && (count%logStep)==0) {
-      System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs");
+      double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
+      NumberFormat nf = NumberFormat.getInstance();
+      nf.setMaximumFractionDigits(2);
+      System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs");
     }
   }
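
With this change each progress line is prefixed by the elapsed wall-clock seconds since PerfRunData's start time; given the format string above, output looks like (values illustrative):

  --> 12.34 sec: Thread-0 processed (add) 20000 docs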

View File: org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java

@@ -30,7 +30,8 @@ import java.io.IOException;
  * Create an index.
  * <br>Other side effects: index writer object in perfRunData is set.
  * <br>Relevant properties: <code>merge.factor, max.buffered,
- * max.field.length</code>.
+ * max.field.length, ram.flush.mb [default 0], autocommit
+ * [default true]</code>.
  */
 public class CreateIndexTask extends PerfTask {

@@ -42,19 +43,23 @@ public class CreateIndexTask extends PerfTask {
     Directory dir = getRunData().getDirectory();
     Analyzer analyzer = getRunData().getAnalyzer();
-    IndexWriter iw = new IndexWriter(dir, analyzer, true);

     Config config = getRunData().getConfig();
     boolean cmpnd = config.get("compound",true);
     int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
     int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
     int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);
+    double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
+    boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
+
+    IndexWriter iw = new IndexWriter(dir, autoCommit, analyzer, true);
     iw.setUseCompoundFile(cmpnd);
     iw.setMergeFactor(mrgf);
     iw.setMaxBufferedDocs(mxbf);
     iw.setMaxFieldLength(mxfl);
+    if (flushAtRAMUsage > 0)
+      iw.setRAMBufferSizeMB(flushAtRAMUsage);

     getRunData().setIndexWriter(iw);
     return 1;

View File: org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java

@@ -30,14 +30,16 @@ import java.io.IOException;
  * Open an index writer.
  * <br>Other side effects: index writer object in perfRunData is set.
  * <br>Relevant properties: <code>merge.factor, max.buffered,
- * max.field.length</code>.
+ * max.field.length, ram.flush.mb [default 0], autocommit
+ * [default true]</code>.
  */
 public class OpenIndexTask extends PerfTask {

   public static final int DEFAULT_MAX_BUFFERED = 10;
   public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
   public static final int DEFAULT_MERGE_PFACTOR = 10;
+  public static final int DEFAULT_RAM_FLUSH_MB = 0;
+  public static final boolean DEFAULT_AUTO_COMMIT = true;

   public OpenIndexTask(PerfRunData runData) {
     super(runData);

@@ -46,7 +48,6 @@ public class OpenIndexTask extends PerfTask {
   public int doLogic() throws IOException {
     Directory dir = getRunData().getDirectory();
     Analyzer analyzer = getRunData().getAnalyzer();
-    IndexWriter writer = new IndexWriter(dir, analyzer, false);

     Config config = getRunData().getConfig();

@@ -54,12 +55,17 @@ public class OpenIndexTask extends PerfTask {
     int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
     int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
     int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH);
+    double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
+    boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
+    IndexWriter writer = new IndexWriter(dir, autoCommit, analyzer, false);

     // must update params for newly opened writer
     writer.setMaxBufferedDocs(mxbf);
     writer.setMaxFieldLength(mxfl);
     writer.setMergeFactor(mrgf);
     writer.setUseCompoundFile(cmpnd); // this one redundant?
+    if (flushAtRAMUsage > 0)
+      writer.setRAMBufferSizeMB(flushAtRAMUsage);

     getRunData().setIndexWriter(writer);
     return 1;
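
Both CreateIndex and OpenIndex now honor the same two writer properties. A hypothetical .alg fragment (values illustrative; the defaults are ram.flush.mb=0, i.e. flush by buffered-doc count, and autocommit=true):

  # flush by RAM usage instead of buffered-doc count
  ram.flush.mb=32
  autocommit=false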

View File: org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (new file)

@@ -0,0 +1,137 @@
package org.apache.lucene.benchmark.byTask.tasks;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedWriter;
import java.io.FileWriter;

import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class WriteLineDocTask extends PerfTask {

  /**
   * Default value for property <code>doc.writeline.log.step</code> - indicating how often
   * an "added N docs" message should be logged.
   */
  public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;

  public WriteLineDocTask(PerfRunData runData) {
    super(runData);
  }

  private int logStep = -1;
  private int docSize = 0;
  int count = 0;
  private BufferedWriter lineFileOut = null;
  private DocMaker docMaker;

  public final static String SEP = "\t";

  /*
   * (non-Javadoc)
   * @see PerfTask#setup()
   */
  public void setup() throws Exception {
    super.setup();
    if (lineFileOut==null) {
      Config config = getRunData().getConfig();
      String fileName = config.get("line.file.out", null);
      if (fileName == null)
        throw new Exception("line.file.out must be set");
      lineFileOut = new BufferedWriter(new FileWriter(fileName));
    }
    docMaker = getRunData().getDocMaker();
  }

  public void tearDown() throws Exception {
    log(++count);
    super.tearDown();
  }

  public int doLogic() throws Exception {
    Document doc;
    if (docSize > 0) {
      doc = docMaker.makeDocument(docSize);
    } else {
      doc = docMaker.makeDocument();
    }

    Field f = doc.getField(BasicDocMaker.BODY_FIELD);

    String body, title, date;
    if (f != null)
      body = f.stringValue().replace('\t', ' ');
    else
      body = null;

    f = doc.getField(BasicDocMaker.TITLE_FIELD);
    if (f != null)
      title = f.stringValue().replace('\t', ' ');
    else
      title = "";

    f = doc.getField(BasicDocMaker.DATE_FIELD);
    if (f != null)
      date = f.stringValue().replace('\t', ' ');
    else
      date = "";

    if (body != null) {
      lineFileOut.write(title, 0, title.length());
      lineFileOut.write(SEP);
      lineFileOut.write(date, 0, date.length());
      lineFileOut.write(SEP);
      lineFileOut.write(body, 0, body.length());
      lineFileOut.newLine();
      lineFileOut.flush();
    }
    return 1;
  }

  private void log (int count) {
    if (logStep<0) {
      // init once per instance
      logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
    }
    if (logStep>0 && (count%logStep)==0) {
      System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs");
    }
  }

  /**
   * Set the params (docSize only)
   * @param params docSize, or 0 for no limit.
   */
  public void setParams(String params) {
    super.setParams(params);
    docSize = (int) Float.parseFloat(params);
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams()
   */
  public boolean supportsParams() {
    return true;
  }
}

View File: org/apache/lucene/benchmark/byTask/utils/Config.java

@@ -22,6 +22,8 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
+import java.util.List;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Properties;

@@ -110,7 +112,9 @@ public class Config {
   private void printProps() {
     System.out.println("------------> config properties:");
-    for (Iterator it = props.keySet().iterator(); it.hasNext();) {
+    List propKeys = new ArrayList(props.keySet());
+    Collections.sort(propKeys);
+    for (Iterator it = propKeys.iterator(); it.hasNext();) {
       String propName = (String) it.next();
       System.out.println(propName + " = " + props.getProperty(propName));
     }

View File: org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java

@@ -18,6 +18,9 @@
 package org.apache.lucene.benchmark.byTask;

 import java.io.StringReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.BufferedReader;

 import org.apache.lucene.benchmark.byTask.Benchmark;
 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;

@@ -79,6 +82,7 @@ public class TestPerfTasksLogic extends TestCase {
     iw.close();
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
     assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
+    ir.close();
   }

   /**

@@ -121,6 +125,7 @@ public class TestPerfTasksLogic extends TestCase {
     iw.close();
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
     assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
+    ir.close();
   }

   /**

@@ -150,6 +155,69 @@ public class TestPerfTasksLogic extends TestCase {
     IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
     int ndocsExpected = 21578; // that's how many docs there are in the Reuters collection.
     assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
+    ir.close();
+  }
+
+  /**
+   * Test WriteLineDoc and LineDocMaker.
+   */
+  public void testLineDocFile() throws Exception {
+
+    File lineFile = new File(System.getProperty("tempDir"), "test.reuters.lines.txt");
+
+    // We will call WriteLineDocs this many times
+    final int NUM_TRY_DOCS = 500;
+
+    // Creates a line file with first 500 docs from reuters
+    String algLines1[] = {
+      "# ----- properties ",
+      "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
+      "doc.maker.forever=false",
+      "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'),
+      "# ----- alg ",
+      "{WriteLineDoc()}:" + NUM_TRY_DOCS,
+    };
+
+    // Run algo
+    Benchmark benchmark = execBenchmark(algLines1);
+
+    // Verify we got somewhere between 1-500 lines (some
+    // Reuters docs have no body, which WriteLineDoc task
+    // skips).
+    BufferedReader r = new BufferedReader(new FileReader(lineFile));
+    int numLines = 0;
+    while(r.readLine() != null)
+      numLines++;
+    r.close();
+    assertTrue("did not see the right number of docs; should be > 0 and <= " + NUM_TRY_DOCS + " but was " + numLines, numLines > 0 && numLines <= NUM_TRY_DOCS);
+
+    // Index the line docs
+    String algLines2[] = {
+      "# ----- properties ",
+      "analyzer=org.apache.lucene.analysis.SimpleAnalyzer",
+      "doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker",
+      "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
+      "doc.maker.forever=false",
+      "autocommit=false",
+      "ram.flush.mb=4",
+      "# ----- alg ",
+      "ResetSystemErase",
+      "CreateIndex",
+      "{AddDoc}: *",
+      "CloseIndex",
+    };
+
+    // Run algo
+    benchmark = execBenchmark(algLines2);
+
+    // now we should be able to open the index for write.
+    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
+    iw.close();
+
+    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
+    assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs());
+    ir.close();
+
+    lineFile.delete();
   }

   // create the benchmark and execute it.