LUCENE-2958: WriteLineDocTask improvements - flexible line fields definition - port/merge from 3x.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1083816 13f79535-47bb-0310-9956-ffa450edef68
2011-03-21 14:59:42 +00:00 · 2011-03-21 14:59:42 +00:00 · a9fda446c3
parent 99c13a4e29
commit a9fda446c3
6 changed files with 467 additions and 66 deletions
--- a/modules/benchmark/CHANGES.txt
+++ b/modules/benchmark/CHANGES.txt
@ -2,6 +2,13 @@ Lucene Benchmark Contrib Change Log

 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.

+03/21/2011
+  LUCENE-2958: WriteLineDocTask improvements - allow to emit line docs also for empty
+  docs, and be flexible about which fields are added to the line file. For this, a header
+  line was added to the line file. That header is examined by LineDocSource. Old line
+  files which have no header line are handled as before, imposing the default header.
+  (Doron Cohen, Shai Erera, Mike McCandless)
+  
 03/21/2011
  LUCENE-2964: Allow benchmark tasks from alternative packages,
  specified through a new property "alt.tasks.packages".
--- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
+++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
@ -22,6 +22,9 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.lang.reflect.Constructor;
+import java.util.Arrays;
+import java.util.Properties;

 import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
 import org.apache.lucene.benchmark.byTask.utils.Config;
@ -40,16 +43,136 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
 * <ul>
 * <li>docs.file=&lt;path to the file&gt;
 * <li>content.source.encoding - default to UTF-8.
+ * <li>line.parser - default to {@link HeaderLineParser} if a header line exists which differs 
+ *     from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
 * </ul>
 */
 public class LineDocSource extends ContentSource {

-  private final static char SEP = WriteLineDocTask.SEP;
+  /** Reader of a single input line into {@link DocData}. */
+  public static abstract class LineParser {
+    protected final String[] header;
+    /** Construct with the header 
+     * @param header header line found in the input file, or null if none
+     */
+    public LineParser(String[] header) {
+      this.header = header; 
+    }
+    /** parse an input line and fill doc data appropriately */
+    public abstract void parseLine(DocData docData, String line);
+  }
+  
+  /** 
+   * {@link LineParser} which ignores the header passed to its constructor
+   * and assumes simply that field names and their order are the same 
+   * as in {@link WriteLineDocTask#DEFAULT_FIELDS} 
+   */
+  public static class SimpleLineParser extends LineParser {
+    public SimpleLineParser(String[] header) {
+      super(header);
+    }
+    public void parseLine(DocData docData, String line) {
+      int k1 = 0;
+      int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
+      if (k2<0) {
+        throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
+      }
+      docData.setTitle(line.substring(k1,k2));
+      k1 = k2+1;
+      k2 = line.indexOf(WriteLineDocTask.SEP, k1);
+      if (k2<0) {
+        throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
+      }
+      docData.setDate(line.substring(k1,k2));
+      k1 = k2+1;
+      k2 = line.indexOf(WriteLineDocTask.SEP, k1);
+      if (k2>=0) {
+        throw new RuntimeException("line: [" + line + "] is in an invalid format (too many separators)!");
+      }
+      // last one
+      docData.setBody(line.substring(k1));
+    }
+  }
+  
+  /** 
+   * {@link LineParser} which sets field names and order by 
+   * the header - any header - of the lines file.
+   * It is less efficient than {@link SimpleLineParser} but more powerful.
+   */
+  public static class HeaderLineParser extends LineParser {
+    private enum FieldName { NAME , TITLE , DATE , BODY, PROP } 
+    private final FieldName[] posToF;
+    public HeaderLineParser(String[] header) {
+      super(header);
+      posToF = new FieldName[header.length];
+      for (int i=0; i<header.length; i++) {
+        String f = header[i];
+        if (DocMaker.NAME_FIELD.equals(f)) {
+          posToF[i] = FieldName.NAME;
+        } else if (DocMaker.TITLE_FIELD.equals(f)) {
+          posToF[i] = FieldName.TITLE;
+        } else if (DocMaker.DATE_FIELD.equals(f)) {
+          posToF[i] = FieldName.DATE;
+        } else if (DocMaker.BODY_FIELD.equals(f)) {
+          posToF[i] = FieldName.BODY;
+        } else {
+          posToF[i] = FieldName.PROP;
+        }
+      }
+    }
+    
+    public void parseLine(DocData docData, String line) {
+      int n = 0;
+      int k1 = 0;
+      int k2;
+      while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
+        if (n>=header.length) {
+          throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
+        }
+        setDocDataField(docData, n, line.substring(k1,k2));
+        ++n;
+        k1 = k2 + 1;
+      }
+      if (n!=header.length-1) {
+        throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
+      }
+      // last one
+      setDocDataField(docData, n, line.substring(k1)); 
+    }

+    private void setDocDataField(DocData docData, int position, String text) {
+      switch(posToF[position]) {
+        case NAME: 
+          docData.setName(text);
+          break;
+        case TITLE: 
+          docData.setTitle(text);
+          break;
+        case DATE: 
+          docData.setDate(text);
+          break;
+        case BODY: 
+          docData.setBody(text);
+          break;
+        case PROP:
+          Properties p = docData.getProps();
+          if (p==null) {
+            p = new Properties();
+            docData.setProps(p);
+          }
+          p.setProperty(header[position], text);
+          break;
+      }
+    }
+  }
+  
  private File file;
  private BufferedReader reader;
  private int readCount;

+  private LineParser docDataLineReader = null;
+  private boolean skipHeaderLine = false;
+
  private synchronized void openFile() {
    try {
      if (reader != null) {
@ -57,6 +180,9 @@ public class LineDocSource extends ContentSource {
      }
      InputStream is = getInputStream(file);
      reader = new BufferedReader(new InputStreamReader(is, encoding), BUFFER_SIZE);
+      if (skipHeaderLine) {
+        reader.readLine(); // skip one line - the header line - already handled that info
+      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
@ -77,7 +203,6 @@ public class LineDocSource extends ContentSource {
    
    synchronized(this) {
      line = reader.readLine();
-      myID = readCount++;
      if (line == null) {
        if (!forever) {
          throw new NoMoreDataException();
@ -86,27 +211,54 @@ public class LineDocSource extends ContentSource {
        openFile();
        return getNextDocData(docData);
      }
+      if (docDataLineReader == null) { // first line ever, one time initialization,
+        docDataLineReader = createDocDataLineReader(line);
+        if (skipHeaderLine) {
+          return getNextDocData(docData);
+        }
+      }
+      // increment IDS only once...
+      myID = readCount++; 
    }
    
-    // A line must be in the following format. If it's not, fail !
-    // title <TAB> date <TAB> body <NEWLINE>
-    int spot = line.indexOf(SEP);
-    if (spot == -1) {
-      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
-    }
-    int spot2 = line.indexOf(SEP, 1 + spot);
-    if (spot2 == -1) {
-      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
-    }
    // The date String was written in the format of DateTools.dateToString.
    docData.clear();
    docData.setID(myID);
-    docData.setBody(line.substring(1 + spot2, line.length()));
-    docData.setTitle(line.substring(0, spot));
-    docData.setDate(line.substring(1 + spot, spot2));
+    docDataLineReader.parseLine(docData, line);
    return docData;
  }

+  private LineParser createDocDataLineReader(String line) {
+    String[] header;
+    String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;
+
+    if (line.startsWith(headIndicator)) {
+      header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
+      skipHeaderLine = true; // mark to skip the header line when input file is reopened
+    } else {
+      header = WriteLineDocTask.DEFAULT_FIELDS;
+    }
+    
+    // if a specific DocDataLineReader was configured, must respect it
+    String docDataLineReaderClassName = getConfig().get("line.parser", null);
+    if (docDataLineReaderClassName!=null) {
+      try {
+        final Class<? extends LineParser> clazz = 
+          Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
+        Constructor<? extends LineParser> cnstr = clazz.getConstructor(new Class[]{String[].class});
+        return cnstr.newInstance((Object)header);
+      } catch (Exception e) {
+        throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
+      }
+    }
+
+    // if this the simple case,   
+    if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
+      return new SimpleLineParser(header);
+    }
+    return new HeaderLineParser(header);
+  }
+
  @Override
  public void resetInputs() throws IOException {
    super.resetInputs();
--- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
+++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
@ -23,6 +23,8 @@ import java.io.FileOutputStream;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.PrintWriter;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

@ -41,11 +43,17 @@ import org.apache.lucene.document.Field;
 * to save the IO overhead of opening a file per document to be indexed.<br>
 * Supports the following parameters:
 * <ul>
- * <li>line.file.out - the name of the file to write the output to. That
+ * <li><b>line.file.out<b> - the name of the file to write the output to. That
 * parameter is mandatory. <b>NOTE:</b> the file is re-created.
- * <li>bzip.compression - whether the output should be bzip-compressed. This is
- * recommended when the output file is expected to be large. (optional, default:
- * false).
+ * <li><b>bzip.compression<b> - whether the output should be bzip-compressed. This is
+ * recommended when the output file is expected to be large. 
+ * <li><b>line.fields<b> - which fields should be written in each line.
+ * (optional, default: {@link #DEFAULT_FIELDS}).
+ * <li><b>sufficient.fields</b> - list of field names, separated by comma, which, 
+ * if all of them are missing, the document will be skipped. For example, to require 
+ * that at least one of f1,f2 is not empty, specify: "f1,f2" in this field. To specify
+ * that no field is required, i.e. that even empty docs should be emitted, specify <b>","</b>.    
+ * (optional, default: {@link #DEFAULT_SUFFICIENT_FIELDS}).
 * </ul>
 * <b>NOTE:</b> this class is not thread-safe and if used by multiple threads the
 * output is unspecified (as all will write to the same output file in a
@ -53,13 +61,32 @@ import org.apache.lucene.document.Field;
 */
 public class WriteLineDocTask extends PerfTask {

+  public static final String FIELDS_HEADER_INDICATOR = "FIELDS_HEADER_INDICATOR###";
+
  public final static char SEP = '\t';
  
+  /**
+   * Fields to be written by default
+   */
+  public static final String[] DEFAULT_FIELDS = new String[] {
+    DocMaker.TITLE_FIELD,
+    DocMaker.DATE_FIELD,
+    DocMaker.BODY_FIELD,
+  };
+  
+  /**
+   * Default fields which at least one of them is required to not skip the doc.
+   */
+  public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD;
+  
  private int docSize = 0;
  private PrintWriter lineFileOut = null;
  private DocMaker docMaker;
  private ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
  private ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
+  private final String[] fieldsToWrite;;
+  private final boolean[] sufficientFields;
+  private final boolean checkSufficientFields;
  
  public WriteLineDocTask(PerfRunData runData) throws Exception {
    super(runData);
@ -89,6 +116,51 @@ public class WriteLineDocTask extends PerfTask {
    }
    lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16));
    docMaker = runData.getDocMaker();
+    
+    // init fields 
+    String f2r = config.get("line.fields",null);
+    if (f2r == null) {
+      fieldsToWrite = DEFAULT_FIELDS;
+    } else {
+      if (f2r.indexOf(SEP)>=0) {
+        throw new IllegalArgumentException("line.fields "+f2r+" should not contain the separator char: "+SEP);
+      }
+      fieldsToWrite = f2r.split(","); 
+    }
+    
+    // init sufficient fields
+    sufficientFields = new boolean[fieldsToWrite.length];
+    String suff = config.get("sufficient.fields",DEFAULT_SUFFICIENT_FIELDS);
+    if (",".equals(suff)) {
+      checkSufficientFields = false;
+    } else {
+      checkSufficientFields = true;
+      HashSet<String> sf = new HashSet<String>(Arrays.asList(suff.split(",")));
+      for (int i=0; i<fieldsToWrite.length; i++) {
+        if (sf.contains(fieldsToWrite[i])) {
+          sufficientFields[i] = true;
+        }
+      }
+    }
+    
+    writeHeader();
+  }
+
+  /**
+   * Write a header to the lines file - indicating how to read the file later 
+   */
+  private void writeHeader() {
+    StringBuilder sb = threadBuffer.get();
+    if (sb == null) {
+      sb = new StringBuilder();
+      threadBuffer.set(sb);
+    }
+    sb.setLength(0);
+    sb.append(FIELDS_HEADER_INDICATOR);
+    for (String f : fieldsToWrite) {
+      sb.append(SEP).append(f);
+    }
+    lineFileOut.println(sb.toString());
  }

  @Override
@ -106,27 +178,26 @@ public class WriteLineDocTask extends PerfTask {
      threadNormalizer.set(matcher);
    }
    
-    Field f = doc.getField(DocMaker.BODY_FIELD);
-    String body = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
-    
-    f = doc.getField(DocMaker.TITLE_FIELD);
-    String title = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
-    
-    if (body.length() > 0 || title.length() > 0) {
-      
-      f = doc.getField(DocMaker.DATE_FIELD);
-      String date = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
-      
-      StringBuilder sb = threadBuffer.get();
-      if (sb == null) {
-        sb = new StringBuilder();
-        threadBuffer.set(sb);
-      }
-      sb.setLength(0);
-      sb.append(title).append(SEP).append(date).append(SEP).append(body);
+    StringBuilder sb = threadBuffer.get();
+    if (sb == null) {
+      sb = new StringBuilder();
+      threadBuffer.set(sb);
+    }
+    sb.setLength(0);
+
+    boolean sufficient = !checkSufficientFields;
+    for (int i=0; i<fieldsToWrite.length; i++) {
+      Field f = doc.getField(fieldsToWrite[i]);
+      String text = f == null ? "" : matcher.reset(f.stringValue()).replaceAll(" ").trim();
+      sb.append(text).append(SEP);
+      sufficient |= text.length()>0 && sufficientFields[i];
+    }
+    if (sufficient) {
+      sb.setLength(sb.length()-1); // remove redundant last separator
      // lineFileOut is a PrintWriter, which synchronizes internally in println.
      lineFileOut.println(sb.toString());
    }
+
    return 1;
  }

--- a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
+++ b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
@ -36,6 +36,7 @@ import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
 import org.apache.lucene.benchmark.byTask.stats.TaskStats;
 import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
 import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
 import org.apache.lucene.collation.CollationKeyAnalyzer;
 import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.FieldsEnum;
@ -393,8 +394,13 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {

    BufferedReader r = new BufferedReader(new FileReader(lineFile));
    int numLines = 0;
-    while(r.readLine() != null)
+    String line;
+    while((line = r.readLine()) != null) {
+      if (numLines==0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
+        continue; // do not count the header line as a doc 
+      }
      numLines++;
+    }
    r.close();
    assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines, NUM_TRY_DOCS, numLines);
    
--- a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java
+++ b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java
@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileOutputStream;
+import java.io.IOException;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.util.Properties;
@ -28,6 +29,8 @@ import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.benchmark.BenchmarkTestCase;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.HeaderLineParser;
+import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.LineParser;
 import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
 import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
 import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
@ -44,29 +47,71 @@ public class LineDocSourceTest extends BenchmarkTestCase {

  private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();

-  private void createBZ2LineFile(File file) throws Exception {
+  private void createBZ2LineFile(File file, boolean addHeader) throws Exception {
    OutputStream out = new FileOutputStream(file);
    out = csFactory.createCompressorOutputStream("bzip2", out);
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
-    StringBuilder doc = new StringBuilder();
-    doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
-    writer.write(doc.toString());
-    writer.newLine();
+    writeDocsToFile(writer, addHeader, null);
    writer.close();
  }

-  private void createRegularLineFile(File file) throws Exception {
-    OutputStream out = new FileOutputStream(file);
-    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+  private void writeDocsToFile(BufferedWriter writer, boolean addHeader, Properties otherFields) throws IOException {
+    if (addHeader) {
+      writer.write(WriteLineDocTask.FIELDS_HEADER_INDICATOR);
+      writer.write(WriteLineDocTask.SEP);
+      writer.write(DocMaker.TITLE_FIELD);
+      writer.write(WriteLineDocTask.SEP);
+      writer.write(DocMaker.DATE_FIELD);
+      writer.write(WriteLineDocTask.SEP);
+      writer.write(DocMaker.BODY_FIELD);
+      if (otherFields!=null) {
+        // additional field names in the header 
+        for (Object fn : otherFields.keySet()) {
+          writer.write(WriteLineDocTask.SEP);
+          writer.write(fn.toString());
+        }
+      }
+      writer.newLine();
+    }
    StringBuilder doc = new StringBuilder();
-    doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
+    doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append(DocMaker.BODY_FIELD);
+    if (otherFields!=null) {
+      // additional field values in the doc line 
+      for (Object fv : otherFields.values()) {
+        doc.append(WriteLineDocTask.SEP).append(fv.toString());
+      }
+    }
    writer.write(doc.toString());
    writer.newLine();
+  }
+
+  private void createRegularLineFile(File file, boolean addHeader) throws Exception {
+    OutputStream out = new FileOutputStream(file);
+    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+    writeDocsToFile(writer, addHeader, null);
+    writer.close();
+  }
+
+  private void createRegularLineFileWithMoreFields(File file, String...extraFields) throws Exception {
+    OutputStream out = new FileOutputStream(file);
+    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+    Properties p = new Properties();
+    for (String f : extraFields) {
+      p.setProperty(f, f);
+    }
+    writeDocsToFile(writer, true, p);
    writer.close();
  }
  
  private void doIndexAndSearchTest(File file, boolean setBZCompress,
-      String bz2CompressVal) throws Exception {
+      String bz2CompressVal, Class<? extends LineParser> lineParserClass, String storedField) throws Exception {
+    doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 1, storedField); // no extra repetitions
+    doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 2, storedField); // 1 extra repetition
+    doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 4, storedField); // 3 extra repetitions
+  }
+  
+  private void doIndexAndSearchTestWithRepeats(File file, boolean setBZCompress,
+      String bz2CompressVal, Class<? extends LineParser> lineParserClass, int numAdds, String storedField) throws Exception {

    Properties props = new Properties();
    
@ -75,11 +120,16 @@ public class LineDocSourceTest extends BenchmarkTestCase {
    if (setBZCompress) {
      props.setProperty("bzip.compression", bz2CompressVal);
    }
+    if (lineParserClass != null) {
+      props.setProperty("line.parser", lineParserClass.getName());
+    }
    
    // Indexing configuration.
    props.setProperty("analyzer", MockAnalyzer.class.getName());
    props.setProperty("content.source", LineDocSource.class.getName());
    props.setProperty("directory", "RAMDirectory");
+    props.setProperty("doc.stored", "true");
+    props.setProperty("doc.index.props", "true");
    
    // Create PerfRunData
    Config config = new Config(props);
@ -87,34 +137,60 @@ public class LineDocSourceTest extends BenchmarkTestCase {

    TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
    tasks.addTask(new CreateIndexTask(runData));
-    tasks.addTask(new AddDocTask(runData));
+    for (int i=0; i<numAdds; i++) {
+      tasks.addTask(new AddDocTask(runData));
+    }
    tasks.addTask(new CloseIndexTask(runData));
    tasks.doLogic();
    
    IndexSearcher searcher = new IndexSearcher(runData.getDirectory(), true);
    TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
-    assertEquals(1, td.totalHits);
+    assertEquals(numAdds, td.totalHits);
    assertNotNull(td.scoreDocs[0]);
+    
+    if (storedField==null) {
+      storedField = DocMaker.BODY_FIELD; // added to all docs and satisfies field-name == value
+    }
+    assertEquals("Wrong field value", storedField, searcher.doc(0).get(storedField));
+
    searcher.close();
  }
  
  /* Tests LineDocSource with a bzip2 input stream. */
  public void testBZip2() throws Exception {
    File file = new File(getWorkDir(), "one-line.bz2");
-    createBZ2LineFile(file);
-    doIndexAndSearchTest(file, true, "true");
+    createBZ2LineFile(file,true);
+    doIndexAndSearchTest(file, true, "true", null, null);
+  }
+
+  public void testBZip2NoHeaderLine() throws Exception {
+    File file = new File(getWorkDir(), "one-line.bz2");
+    createBZ2LineFile(file,false);
+    doIndexAndSearchTest(file, true, "true", null, null);
  }
  
  public void testBZip2AutoDetect() throws Exception {
    File file = new File(getWorkDir(), "one-line.bz2");
-    createBZ2LineFile(file);
-    doIndexAndSearchTest(file, false, null);
+    createBZ2LineFile(file,false);
+    doIndexAndSearchTest(file, false, null, null, null);
  }
  
  public void testRegularFile() throws Exception {
    File file = new File(getWorkDir(), "one-line");
-    createRegularLineFile(file);
-    doIndexAndSearchTest(file, false, null);
+    createRegularLineFile(file,true);
+    doIndexAndSearchTest(file, false, null, null, null);
+  }
+
+  public void testRegularFileSpecialHeader() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    createRegularLineFile(file,true);
+    doIndexAndSearchTest(file, false, null, HeaderLineParser.class, null);
+  }
+
+  public void testRegularFileNoHeaderLine() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    createRegularLineFile(file,false);
+    doIndexAndSearchTest(file, false, null, null, null);
  }

  public void testInvalidFormat() throws Exception {
@ -134,7 +210,7 @@ public class LineDocSourceTest extends BenchmarkTestCase {
      writer.newLine();
      writer.close();
      try {
-        doIndexAndSearchTest(file, false, null);
+        doIndexAndSearchTest(file, false, null, null, null);
        fail("Some exception should have been thrown for: [" + testCases[i] + "]");
      } catch (Exception e) {
        // expected.
@ -142,4 +218,19 @@ public class LineDocSourceTest extends BenchmarkTestCase {
    }
  }
  
+  /** Doc Name is not part of the default header */
+  public void testWithDocsName()  throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD);
+    doIndexAndSearchTest(file, false, null, null, DocMaker.NAME_FIELD);
+  }
+
+  /** Use fields names that are not defined in Docmaker and so will go to Properties */
+  public void testWithProperties()  throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    String specialField = "mySpecialField";
+    createRegularLineFileWithMoreFields(file, specialField);
+    doIndexAndSearchTest(file, false, null, null, specialField);
+  }
+  
 }
--- a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
+++ b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
@ -98,6 +98,25 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
      return doc;
    }
  }
+
+  // class has to be public so that Class.forName.newInstance() will work
+  // same as JustDate just that this one is treated as legal
+  public static final class LegalJustDateDocMaker extends DocMaker {
+    @Override
+    public Document makeDocument() throws Exception {
+      Document doc = new Document();
+      doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      return doc;
+    }
+  }
+
+  // class has to be public so that Class.forName.newInstance() will work
+  public static final class EmptyDocMaker extends DocMaker {
+    @Override
+    public Document makeDocument() throws Exception {
+      return new Document();
+    }
+  }
  
  // class has to be public so that Class.forName.newInstance() will work
  public static final class ThreadingDocMaker extends DocMaker {
@ -117,6 +136,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
  private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();

  private PerfRunData createPerfRunData(File file, boolean setBZCompress,
+                                        boolean allowEmptyDocs,
                                        String bz2CompressVal,
                                        String docMakerName) throws Exception {
    Properties props = new Properties();
@ -126,6 +146,13 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
      props.setProperty("bzip.compression", bz2CompressVal);
    }
    props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
+    if (allowEmptyDocs) {
+      props.setProperty("sufficient.fields", ",");
+    }
+    if (docMakerName.equals(LegalJustDateDocMaker.class.getName())) {
+      props.setProperty("line.fields", DocMaker.DATE_FIELD);
+      props.setProperty("sufficient.fields", DocMaker.DATE_FIELD);
+    }
    Config config = new Config(props);
    return new PerfRunData(config);
  }
@ -139,6 +166,8 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
    try {
      String line = br.readLine();
+      assertHeaderLine(line);
+      line = br.readLine();
      assertNotNull(line);
      String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
      int numExpParts = expBody == null ? 2 : 3;
@ -153,13 +182,17 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
      br.close();
    }
  }
+
+  private void assertHeaderLine(String line) {
+    assertTrue("First line should be a header line",line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR));
+  }
  
  /* Tests WriteLineDocTask with a bzip2 format. */
  public void testBZip2() throws Exception {
    
    // Create a document in bz2 format.
    File file = new File(getWorkDir(), "one-line.bz2");
-    PerfRunData runData = createPerfRunData(file, true, "true", WriteLineDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, true, false, "true", WriteLineDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();
@ -171,7 +204,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    
    // Create a document in bz2 format.
    File file = new File(getWorkDir(), "one-line.bz2");
-    PerfRunData runData = createPerfRunData(file, false, null, WriteLineDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, WriteLineDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();
@ -183,7 +216,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    
    // Create a document in regular format.
    File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, true, "false", WriteLineDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, true, false, "false", WriteLineDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();
@ -196,7 +229,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    // separator char. However, it didn't replace newline characters, which
    // resulted in errors in LineDocSource.
    File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, NewLinesDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();
@ -209,7 +242,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    // had a TITLE element (LUCENE-1755). It should throw away documents if they
    // don't have BODY nor TITLE
    File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, false, null, NoBodyDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, NoBodyDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();
@ -219,7 +252,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
  
  public void testEmptyTitle() throws Exception {
    File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, false, null, NoTitleDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, NoTitleDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();
@ -227,9 +260,10 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    doReadTest(file, false, "", "date", "body");
  }
  
+  /** Fail by default when there's only date */
  public void testJustDate() throws Exception {
    File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, false, null, JustDateDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, JustDateDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();
@ -237,15 +271,53 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
    try {
      String line = br.readLine();
+      assertHeaderLine(line);
+      line = br.readLine();
      assertNull(line);
    } finally {
      br.close();
    }
  }

+  public void testLegalJustDate() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    PerfRunData runData = createPerfRunData(file, false, false, null, LegalJustDateDocMaker.class.getName());
+    WriteLineDocTask wldt = new WriteLineDocTask(runData);
+    wldt.doLogic();
+    wldt.close();
+    
+    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
+    try {
+      String line = br.readLine();
+      assertHeaderLine(line);
+      line = br.readLine();
+      assertNotNull(line);
+    } finally {
+      br.close();
+    }
+  }
+
+  public void testEmptyDoc() throws Exception {
+    File file = new File(getWorkDir(), "one-line");
+    PerfRunData runData = createPerfRunData(file, false, true, null, EmptyDocMaker.class.getName());
+    WriteLineDocTask wldt = new WriteLineDocTask(runData);
+    wldt.doLogic();
+    wldt.close();
+    
+    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
+    try {
+      String line = br.readLine();
+      assertHeaderLine(line);
+      line = br.readLine();
+      assertNotNull(line);
+    } finally {
+      br.close();
+    }
+  }
+
  public void testMultiThreaded() throws Exception {
    File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, false, null, ThreadingDocMaker.class.getName());
+    PerfRunData runData = createPerfRunData(file, false, false, null, ThreadingDocMaker.class.getName());
    final WriteLineDocTask wldt = new WriteLineDocTask(runData);
    Thread[] threads = new Thread[10];
    for (int i = 0; i < threads.length; i++) {
@ -269,8 +341,10 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    Set<String> ids = new HashSet<String>();
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
    try {
+      String line = br.readLine();
+      assertHeaderLine(line); // header line is written once, no matter how many threads there are
      for (int i = 0; i < threads.length; i++) {
-        String line = br.readLine();
+        line = br.readLine();
        String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
        assertEquals(3, parts.length);
        // check that all thread names written are the same in the same line