LUCENE-849: configurable HTML Parser; external classes; exhaustive doc maker - '*';

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@522569 13f79535-47bb-0310-9956-ffa450edef68
2007-03-26 16:46:33 +00:00 · 2007-03-26 16:46:33 +00:00 · 031f50c4e7
parent eee9d52886
commit 031f50c4e7
17 changed files with 513 additions and 106 deletions
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@ -4,6 +4,13 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety

 $Id:$

+3/25/07
+
+LUCENE-849: 
+1. The HTML parser to use is now configurable with the html.parser property.
+2. External classes added to classpath with -Dbenchmark.ext.classpath=path.
+3. '*' as repeating number now means "exhaust doc maker - no repetitions".
+
 3/22/07

 -Moved withRetrieve() call out of the loop in ReadTask
--- a/contrib/benchmark/build.xml
+++ b/contrib/benchmark/build.xml
@ -97,6 +97,7 @@
    <path id="run.classpath">
        <path refid="classpath"/>
        <pathelement location="${build.dir}/classes/java"/>
+        <pathelement location="${benchmark.ext.classpath}"/>
    </path>

    <target name="run-standard" depends="compile,check-files,get-files" description="Run the standard baseline">
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
@ -52,6 +52,7 @@ public class Benchmark {
    try {
      runData = new PerfRunData(new Config(algReader));
    } catch (Exception e) {
+      e.printStackTrace();
      throw new Exception("Error: cannot init PerfRunData!",e);
    }
    
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
@ -23,6 +23,7 @@ import java.util.Iterator;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.HTMLParser;
 import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
 import org.apache.lucene.benchmark.byTask.stats.Points;
 import org.apache.lucene.benchmark.byTask.tasks.ReadTask;
@ -58,6 +59,7 @@ public class PerfRunData {
  private Directory directory;
  private Analyzer analyzer;
  private DocMaker docMaker;
+  private HTMLParser htmlParser;
  
  // we use separate (identical) instances for each "read" task type, so each can iterate the quries separately.
  private HashMap readTaskQueryMaker;
@ -79,7 +81,10 @@ public class PerfRunData {
    docMaker.setConfig(config);
    // query makers
    readTaskQueryMaker = new HashMap();
-    qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker"));
+    qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker"));
+    // html parser, used for some doc makers
+    htmlParser = (HTMLParser) Class.forName(config.get("html.parser","org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser")).newInstance();
+    docMaker.setHTMLParser(htmlParser);

    // index stuff
    reinit(false);
@ -229,4 +234,11 @@ public class PerfRunData {
    return qm;
  }

+  /**
+   * @return Returns the htmlParser.
+   */
+  public HTMLParser getHtmlParser() {
+    return htmlParser;
+  }
+
 }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
@ -26,9 +26,7 @@ import org.apache.lucene.document.Field;
 import java.io.File;
 import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
-import java.util.Date;
 import java.util.Iterator;
-import java.util.Properties;


 /**
@ -47,15 +45,8 @@ public abstract class BasicDocMaker implements DocMaker {
  
  private int numDocsCreated = 0;
  private boolean storeBytes = false;
+  protected boolean forever;

-  static class DocData {
-    String name;
-    Date date;
-    String title;
-    String body;
-    Properties props;
-  }
-  
  private static class LeftOver {
    private DocData docdata;
    private int cnt;
@ -80,10 +71,14 @@ public abstract class BasicDocMaker implements DocMaker {

  /**
   * Return the data of the next document.
+   * All current implementations can create docs forever. 
+   * When the input data is exhausted, the input files are iterated again from the start.
+   * This re-iteration can be avoided by setting doc.maker.forever to false (default is true).
   * @return data of the next document.
   * @exception if cannot create the next doc data
+   * @exception NoMoreDataException if data is exhausted (and 'forever' set to false).
   */
-  protected abstract DocData getNextDocData() throws Exception;
+  protected abstract DocData getNextDocData() throws NoMoreDataException, Exception;

  /*
   *  (non-Javadoc)
@ -103,32 +98,32 @@ public abstract class BasicDocMaker implements DocMaker {
    int docid = incrNumDocsCreated();
    Document doc = new Document();
    doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
-    if (docData.name!=null) {
-      String name = (cnt<0 ? docData.name : docData.name+"_"+cnt);
+    if (docData.getName()!=null) {
+      String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
      doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
    }
-    if (docData.date!=null) {
-      String dateStr = DateTools.dateToString(docData.date, DateTools.Resolution.SECOND);
+    if (docData.getDate()!=null) {
+      String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
      doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
    }
-    if (docData.title!=null) {
-      doc.add(new Field("doctitle", docData.title, storeVal, indexVal, termVecVal));
+    if (docData.getTitle()!=null) {
+      doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal));
    }
-    if (docData.body!=null && docData.body.length()>0) {
+    if (docData.getBody()!=null && docData.getBody().length()>0) {
      String bdy;
-      if (size<=0 || size>=docData.body.length()) {
-        bdy = docData.body; // use all
-        docData.body = "";  // nothing left
+      if (size<=0 || size>=docData.getBody().length()) {
+        bdy = docData.getBody(); // use all
+        docData.setBody("");  // nothing left
      } else {
        // attempt not to break words - if whitespace found within next 20 chars...
-        for (int n=size-1; n<size+20 && n<docData.body.length(); n++) {
-          if (Character.isWhitespace(docData.body.charAt(n))) {
+        for (int n=size-1; n<size+20 && n<docData.getBody().length(); n++) {
+          if (Character.isWhitespace(docData.getBody().charAt(n))) {
            size = n;
            break;
          }
        }
-        bdy = docData.body.substring(0,size); // use part
-        docData.body = docData.body.substring(size); // some left
+        bdy = docData.getBody().substring(0,size); // use part
+        docData.setBody(docData.getBody().substring(size)); // some left
      }
      doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
      if (storeBytes == true) {
@ -136,13 +131,13 @@ public abstract class BasicDocMaker implements DocMaker {
      }
    }

-    if (docData.props!=null) {
-      for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {
+    if (docData.getProps()!=null) {
+      for (Iterator it = docData.getProps().keySet().iterator(); it.hasNext(); ) {
        String key = (String) it.next();
-        String val = (String) docData.props.get(key);
+        String val = (String) docData.getProps().get(key);
        doc.add(new Field(key, val, storeVal, indexVal, termVecVal));
      }
-      docData.props = null;
+      docData.setProps(null);
    }
    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
    return doc;
@ -154,19 +149,19 @@ public abstract class BasicDocMaker implements DocMaker {
   */
  public Document makeDocument(int size) throws Exception {
    LeftOver lvr = (LeftOver) leftovr.get();
-    if (lvr==null || lvr.docdata==null || lvr.docdata.body==null || lvr.docdata.body.length()==0) {
+    if (lvr==null || lvr.docdata==null || lvr.docdata.getBody()==null || lvr.docdata.getBody().length()==0) {
      resetLeftovers();
    }
    DocData dd = (lvr==null ? getNextDocData() : lvr.docdata);
    int cnt = (lvr==null ? 0 : lvr.cnt);
-    while (dd.body==null || dd.body.length()<size) {
+    while (dd.getBody()==null || dd.getBody().length()<size) {
      DocData dd2 = dd;
      dd = getNextDocData();
      cnt = 0;
-      dd.body = dd2.body + dd.body;
+      dd.setBody(dd2.getBody() + dd.getBody());
    }
    Document doc = createDocument(dd,size,cnt);
-    if (dd.body==null || dd.body.length()==0) {
+    if (dd.getBody()==null || dd.getBody().length()==0) {
      resetLeftovers();
    } else {
      if (lvr == null) {
@ -195,6 +190,7 @@ public abstract class BasicDocMaker implements DocMaker {
    indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
    termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO);
    storeBytes = config.get("doc.store.body.bytes", false);
+    forever = config.get("doc.maker.forever",true);
  }

  /*
@ -247,6 +243,8 @@ public abstract class BasicDocMaker implements DocMaker {
  private int lastPrintedNumUniqueTexts = 0;
  private long lastPrintedNumUniqueBytes = 0;
  private int printNum = 0;
+  private HTMLParser htmlParser;
+  
  public void printDocStatistics() {
    boolean print = false;
    String col = "                  ";
@ -277,6 +275,7 @@ public abstract class BasicDocMaker implements DocMaker {
  }

  protected void collectFiles(File f, ArrayList inputFiles) {
+    //System.out.println("Collect: "+f.getAbsolutePath());
    if (!f.canRead()) {
      return;
    }
@ -291,5 +290,20 @@ public abstract class BasicDocMaker implements DocMaker {
    addUniqueBytes(f.length());
  }

+  /* (non-Javadoc)
+   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#setHTMLParser(org.apache.lucene.benchmark.byTask.feeds.HTMLParser)
+   */
+  public void setHTMLParser(HTMLParser htmlParser) {
+    this.htmlParser = htmlParser;
+  }
+
+  /*
+   *  (non-Javadoc)
+   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#getHtmlParser()
+   */
+  public HTMLParser getHtmlParser() {
+    return htmlParser;
+  }
+

 }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
@ -0,0 +1,82 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+import java.util.Properties;
+
+/**
+ * HTML Parser that is based on Lucene's demo HTML parser.
+ */
+public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
+
+  DateFormat dateFormat;
+  
+  public DemoHTMLParser () {
+    dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US);  //Tue, 09 Dec 2003 22:39:08 GMT
+    dateFormat.setLenient(true);
+  }
+
+  /*
+   *  (non-Javadoc)
+   * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.io.Reader, java.text.DateFormat)
+   */
+  public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
+    org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader);
+    
+    // title
+    String title = p.getTitle();
+    // properties 
+    Properties props = p.getMetaTags(); 
+    // body
+    Reader r = p.getReader();
+    char c[] = new char[1024];
+    StringBuffer bodyBuf = new StringBuffer();
+    int n;
+    while ((n = r.read(c)) >= 0) {
+      if (n>0) {
+        bodyBuf.append(c,0,n);
+      }
+    }
+    r.close();
+    if (date == null && props.getProperty("date")!=null) {
+      try {
+        date = dateFormat.parse(props.getProperty("date").trim());
+      } catch (ParseException e) {
+        // do not fail test just because a date could not be parsed
+        System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date"));
+        date = new Date(); // now 
+      }
+    }
+      
+    return new DocData(name, bodyBuf.toString(), title, props, date);
+  }
+
+  public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException {
+    // Wrap the input text in a StringReader and delegate to the Reader-based parse.
+    return parse(name, date, new StringReader(inputText.toString()), dateFormat);
+  }
+
+}
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
@ -0,0 +1,113 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Date;
+import java.util.Properties;
+
+/**
+ * Output of parsing (e.g. HTML parsing) of an input document.
+ */
+
+public class DocData {
+  
+  private String name;
+  private String body;
+  private String title;
+  private Date date;
+  private Properties props;
+  
+  public DocData(String name, String body, String title, Properties props, Date date) {
+    this.name = name;
+    this.body = body;
+    this.title = title;
+    this.date = date;
+    this.props = props;
+  }
+
+  /**
+   * @return Returns the name.
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * @param name The name to set.
+   */
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  /**
+   * @return Returns the props.
+   */
+  public Properties getProps() {
+    return props;
+  }
+
+  /**
+   * @param props The props to set.
+   */
+  public void setProps(Properties props) {
+    this.props = props;
+  }
+
+  /**
+   * @return Returns the body.
+   */
+  public String getBody() {
+    return body;
+  }
+
+  /**
+   * @param body The body to set.
+   */
+  public void setBody(String body) {
+    this.body = body;
+  }
+
+  /**
+   * @return Returns the title.
+   */
+  public String getTitle() {
+    return title;
+  }
+
+  /**
+   * @param title The title to set.
+   */
+  public void setTitle(String title) {
+    this.title = title;
+  }
+
+  /**
+   * @return Returns the date.
+   */
+  public Date getDate() {
+    return date;
+  }
+
+  /**
+   * @param date The date to set.
+   */
+  public void setDate(Date date) {
+    this.date = date;
+  }
+
+}
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
@ -61,4 +61,11 @@ public interface DocMaker {

  /** Print some statistics on docs available/added/etc. */ 
  public void printDocStatistics();
-}
+
+  /** Set the html parser to use, when appropriate */
+  public void setHTMLParser(HTMLParser htmlParser);
+  
+  /** Returns the htmlParser. */
+  public HTMLParser getHtmlParser();
+
+}
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
@ -0,0 +1,51 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.text.DateFormat;
+import java.util.Date;
+
+/**
+ * HTML parsing interface for test purposes.
+ */
+public interface HTMLParser {
+
+  /**
+   * Parse the input Reader and return DocData. 
+   * A provided name or date is used for the result, otherwise an attempt is 
+   * made to set them from the parsed data.
+   * @param dateFormat date formatter to use for extracting the date.   
+   * @param name name of the result doc data. If null, attempt to set by parsed data.
+   * @param date date of the result doc data. If null, attempt to set by parsed data.
+   * @param reader of html text to parse.
+   * @return Parsed doc data.
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
+  
+  /**
+   * Parse the inputText and return DocData. 
+   * @param inputText the html text to parse.
+   * @see #parse(String, Date, Reader, DateFormat)
+   */
+  public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException;
+
+}
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/NoMoreDataException.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/NoMoreDataException.java
@ -0,0 +1,27 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Exception indicating there is no more data.
+ * Thrown by doc makers if doc.maker.forever is false and the doc sources of that maker were exhausted.
+ * This is useful for iterating over all documents of a source when we don't know in advance how many docs there are.
+ */
+public class NoMoreDataException extends Exception {
+
+}
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
@ -25,6 +25,7 @@ import java.io.FileReader;
 import java.text.DateFormat;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
+import java.util.Date;
 import java.util.Locale;


@ -66,13 +67,16 @@ public class ReutersDocMaker extends BasicDocMaker {
    File f = null;
    String name = null;
    synchronized (this) {
-      f = (File) inputFiles.get(nextFile++);
-      name = f.getCanonicalPath()+"_"+iteration;
      if (nextFile >= inputFiles.size()) { 
-        // exhausted files, start a new round
+        // exhausted files, start a new round, unless forever set to false.
+        if (!forever) {
+          throw new NoMoreDataException();
+        }
        nextFile = 0;
        iteration++;
      }
+      f = (File) inputFiles.get(nextFile++);
+      name = f.getCanonicalPath()+"_"+iteration;
    }
    
    BufferedReader reader = new BufferedReader(new FileReader(f));
@ -90,13 +94,9 @@ public class ReutersDocMaker extends BasicDocMaker {
    
    addBytes(f.length());

-    DocData dd = new DocData();
    
-    dd.date = dateFormat.parse(dateStr.trim());
-    dd.name = name;
-    dd.title = title;
-    dd.body = bodyBuf.toString();
-    return dd;
+    Date date = dateFormat.parse(dateStr.trim()); 
+    return new DocData(name, bodyBuf.toString(), title, null, date);
  }


--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleDocMaker.java
@ -18,7 +18,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
 */

 /**
- * Create documents for the test
+ * Create documents for the test.
 */
 public class SimpleDocMaker extends BasicDocMaker {
  
@ -58,12 +58,12 @@ public class SimpleDocMaker extends BasicDocMaker {
    return 0; // not applicable
  }

-  protected DocData getNextDocData() {
-    DocData dd = new DocData();
-    dd.body = DOC_TEXT;
-    dd.name = "doc"+newdocid();
+  protected DocData getNextDocData() throws NoMoreDataException {
+    if (docID>0 && !forever) {
+      throw new NoMoreDataException();
+    }
    addBytes(DOC_TEXT.length());
-    return dd;
+    return new DocData("doc"+newdocid(),DOC_TEXT, null, null, null);
  }

 }
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
@ -23,19 +23,15 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.io.Reader;
-import java.io.StringReader;
 import java.text.DateFormat;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.Locale;
-import java.util.Properties;
 import java.util.zip.GZIPInputStream;

 import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.demo.html.HTMLParser;


 /**
@ -45,7 +41,7 @@ public class TrecDocMaker extends BasicDocMaker {

  private static final String newline = System.getProperty("line.separator");
  
-  private DateFormat dateFormat;
+  private DateFormat dateFormat [];
  private File dataDir = null;
  private ArrayList inputFiles = new ArrayList();
  private int nextFile = 0;
@ -53,6 +49,13 @@ public class TrecDocMaker extends BasicDocMaker {
  private BufferedReader reader;
  private GZIPInputStream zis;
  
+  private static final String DATE_FORMATS [] = {
+    "EEE, dd MMM yyyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
+    "EEE MMM dd kk:mm:ss yyyy z",  //Tue Dec 09 16:45:08 2003 EST
+    "EEE, dd-MMM-':'y kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
+    "EEE, dd-MMM-yyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
+  };
+  
  /* (non-Javadoc)
   * @see SimpleDocMaker#setConfig(java.util.Properties)
   */
@ -65,34 +68,44 @@ public class TrecDocMaker extends BasicDocMaker {
      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
    }
    // date format: 30-MAR-1987 14:22:36.87
-    dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US);  //Tue, 09 Dec 2003 22:39:08 GMT
-    dateFormat.setLenient(true);
-  }
+    dateFormat = new SimpleDateFormat[DATE_FORMATS.length];
+    for (int i = 0; i < dateFormat.length; i++) {
+      dateFormat[i] = new SimpleDateFormat(DATE_FORMATS[i],Locale.US);
+      dateFormat[i].setLenient(true);
+    }
+ }

-  private void openNextFile() throws Exception {
+  private void openNextFile() throws NoMoreDataException, Exception {
    closeInputs();
    int retries = 0;
-    while (retries<20) {
+    while (true) {
      File f = null;
      synchronized (this) {
-        f = (File) inputFiles.get(nextFile++);
        if (nextFile >= inputFiles.size()) { 
-          // exhausted files, start a new round
+          // exhausted files, start a new round, unless forever set to false.
+          if (!forever) {
+            throw new NoMoreDataException();
+          }
          nextFile = 0;
          iteration++;
        }
+        f = (File) inputFiles.get(nextFile++);
      }
      System.out.println("opening: "+f+" length: "+f.length());
      try {
        zis = new GZIPInputStream(new BufferedInputStream(new FileInputStream(f)));
-        break;
+        reader = new BufferedReader(new InputStreamReader(zis));
+        return;
      } catch (Exception e) {
        retries++;
-        System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+"  #retries="+retries);
-        continue;
+        if (retries<20) {
+          System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+"  #retries="+retries);
+          continue;
+        } else {
+          throw new NoMoreDataException();
+        }
      }
    }
-    reader = new BufferedReader(new InputStreamReader(zis));
  }

  private void closeInputs() {
@ -142,7 +155,7 @@ public class TrecDocMaker extends BasicDocMaker {
    return sb;
  }
  
-  protected DocData getNextDocData() throws Exception {
+  protected DocData getNextDocData() throws NoMoreDataException, Exception {
    if (reader==null) {
      openNextFile();
    }
@ -162,39 +175,27 @@ public class TrecDocMaker extends BasicDocMaker {
    // 6. collect until end of doc
    sb = read("</DOC>",null,false,true);
    // this is the next document, so parse it 
-    // TODO use a more robust html parser (current one aborts parsing quite easily). 
-    HTMLParser p = new HTMLParser(new StringReader(sb.toString()));
-    // title
-    String title = p.getTitle();
-    // properties 
-    Properties props = p.getMetaTags(); 
-    // body
-    Reader r = p.getReader();
-    char c[] = new char[1024];
-    StringBuffer bodyBuf = new StringBuffer();
-    int n;
-    while ((n = r.read(c)) >= 0) {
-      if (n>0) {
-        bodyBuf.append(c,0,n);
+    Date date = parseDate(dateStr);
+    HTMLParser p = getHtmlParser();
+    DocData docData = p.parse(name, date, sb, dateFormat[0]);
+    addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text). 
+    
+    return docData;
+  }
+
+  private Date parseDate(String dateStr) {
+    Date date = null;
+    for (int i=0; i<dateFormat.length; i++) {
+      try {
+        date = dateFormat[i].parse(dateStr.trim());
+        return date;
+      } catch (ParseException e) {
      }
    }
-    r.close();
-    addBytes(bodyBuf.length());
-    
-    DocData dd = new DocData();
-
-    try {
-      dd.date = dateFormat.parse(dateStr.trim());
-    } catch (ParseException e) {
-      // do not fail test just because a date could not be parsed
-      System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr);
-      dd.date = new Date(); // now 
-    }
-    dd.name = name;
-    dd.title = title;
-    dd.body = bodyBuf.toString();
-    dd.props = props;
-    return dd;
+    // do not fail test just because a date could not be parsed
+    System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr);
+    date = new Date(); // now 
+    return date;
  }


--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
@ -149,6 +149,19 @@ Assume you added the class "WonderfulTask" - doing so also enables the
 command "Wonderful" to be used in the algorithm.
 </p>

+<p>
+<u>External classes</u>: It is sometimes useful to invoke the benchmark
+package with your external alg file that configures the use of your own
+doc/query maker and/or html parser. You can do this without
+modifying the benchmark package code, by passing your class path
+with the benchmark.ext.classpath property:
+<ul>
+  <li>ant run-task -Dtask.alg=[full-path-to-your-alg-file]
+      <font color="#FF0000">-Dbenchmark.ext.classpath=/mydir/classes
+      </font> -Dtask.mem=512M</li>
+</ul>
+</p>
+
 <a name="algorithm"></a>
 <h2>Benchmark "algorithm"</h2>

@ -198,6 +211,14 @@ The following is an informal description of the supported syntax.
 30 times in a row.
 <br>Example -  <font color="#FF0066">{ AddDoc AddDoc } : 30</font> - would do
 addDoc 60 times in a row.
+ <br><b>Exhaustive repeating</b>: use <font color="#FF0066">*</font> instead of
+ a number to repeat until the doc maker is exhausted (requires doc.maker.forever=false).
+ This is sometimes useful for adding as many files as a doc maker can create,
+ without iterating over the same files again, in the case that the exact
+ number of files is not known in advance. For instance, TREC files extracted
+ from a zip file.
+ <br>Example -  <font color="#FF0066">{ AddDoc } : *</font>  - would add docs
+ until the doc maker is "exhausted".
 </li>
 <li>
 <b>Command parameter</b>: a command can optionally take a single parameter.
@ -487,6 +508,8 @@ Here is a list of currently defined properties:
  <li><b>Docs and queries creation:</b></li>
    <ul><li>analyzer
    </li><li>doc.maker
+    </li><li>doc.maker.forever
+    </li><li>html.parser
    </li><li>doc.stored
    </li><li>doc.tokenized
    </li><li>doc.term.vector
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
@ -21,11 +21,13 @@ import java.util.ArrayList;
 import java.util.Iterator;

 import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;

 /**
 * Sequence of parallel or sequential tasks.
 */
 public class TaskSequence extends PerfTask {
+  public static int REPEAT_EXHAUST = -2; 
  private ArrayList tasks;
  private int repetitions = 1;
  private boolean parallel;
@ -61,9 +63,13 @@ public class TaskSequence extends PerfTask {

  /**
   * @param repetitions The repetitions to set.
+   * @throws Exception if REPEAT_EXHAUST is requested for a parallel sequence.
   */
-  public void setRepetitions(int repetitions) {
+  public void setRepetitions(int repetitions) throws Exception {
    this.repetitions = repetitions;
+    if (repetitions==REPEAT_EXHAUST && isParallel()) {
+      throw new Exception("REPEAT_EXHAUST is not allowed for parallel tasks");
+    }
    setSequenceName();
  }

@ -88,10 +94,15 @@ public class TaskSequence extends PerfTask {
    }
    
    int count = 0;
-    for (int k=0; k<repetitions; k++) {
+    boolean exhausted = false;
+    for (int k=0; (repetitions==REPEAT_EXHAUST && !exhausted) || k<repetitions; k++) {
      for (Iterator it = tasks.iterator(); it.hasNext();) {
        PerfTask task = (PerfTask) it.next();
-        count += task.runAndMaybeStats(letChildReport);
+        try {
+          count += task.runAndMaybeStats(letChildReport);
+        } catch (NoMoreDataException e) {
+          exhausted = true;
+        }
      }
    }
    return count;
@ -101,7 +112,8 @@ public class TaskSequence extends PerfTask {
    long delayStep = (perMin ? 60000 : 1000) /rate;
    long nextStartTime = System.currentTimeMillis();
    int count = 0;
-    for (int k=0; k<repetitions; k++) {
+    boolean exhausted = false;
+    for (int k=0; (repetitions==REPEAT_EXHAUST && !exhausted) || k<repetitions; k++) {
      for (Iterator it = tasks.iterator(); it.hasNext();) {
        PerfTask task = (PerfTask) it.next();
        long waitMore = nextStartTime - System.currentTimeMillis();
@ -110,7 +122,11 @@ public class TaskSequence extends PerfTask {
          Thread.sleep(waitMore);
        }
        nextStartTime += delayStep; // this aims at avarage rate. 
-        count += task.runAndMaybeStats(letChildReport);
+        try {
+          count += task.runAndMaybeStats(letChildReport);
+        } catch (NoMoreDataException e) {
+          exhausted = true;
+        }
      }
    }
    return count;
@ -198,6 +214,9 @@ public class TaskSequence extends PerfTask {
    if (repetitions>1) {
      sb.append(" * " + repetitions);
    }
+    if (repetitions==REPEAT_EXHAUST) {
+      sb.append(" * EXHAUST");
+    }
    if (rate>0) {
      sb.append(",  rate: " + rate+"/"+(perMin?"min":"sec"));
    }
@ -237,7 +256,9 @@ public class TaskSequence extends PerfTask {

  private void setSequenceName() {
    seqName = super.getName();
-    if (repetitions>1) {
+    if (repetitions==REPEAT_EXHAUST) {
+      seqName += "_Exhaust";
+    } else if (repetitions>1) {
      seqName += "_"+repetitions;
    }
    if (rate>0) {
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
@ -117,8 +117,12 @@ public class Algorithm {
              colonOk = false;
              // get repetitions number
              stok.nextToken();
-              if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString());
-              ((TaskSequence)prevTask).setRepetitions((int)stok.nval); 
+              if ((char)stok.ttype == '*') {
+                ((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST);
+              } else {
+                if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString());
+                ((TaskSequence)prevTask).setRepetitions((int)stok.nval);
+              }
              // check for rate specification (ops/min)
              stok.nextToken();
              if (stok.ttype!=':') {
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
@ -81,6 +81,49 @@ public class TestPerfTasksLogic extends TestCase {
    assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
  }

+  /**
+   * Test Exhausting Doc Maker logic
+   */
+  public void testExhaustDocMaker() throws Exception {
+    // 1. alg definition (required in every "logic" test)
+    String algLines[] = {
+        "# ----- properties ",
+        "doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker",
+        "doc.add.log.step=1",
+        "doc.term.vector=false",
+        "doc.maker.forever=false",
+        "directory=RAMDirectory",
+        "doc.stored=false",
+        "doc.tokenized=false",
+        "# ----- alg ",
+        "CreateIndex",
+        "{ AddDoc } : * ",
+        "Optimize",
+        "CloseIndex",
+        "OpenReader",
+        "{ CountingSearchTest } : 100",
+        "CloseReader",
+        "[ CountingSearchTest > : 30",
+        "[ CountingSearchTest > : 9",
+    };
+    
+    // 2. we test this value later
+    CountingSearchTestTask.numSearches = 0;
+    
+    // 3. execute the algorithm  (required in every "logic" test)
+    Benchmark benchmark = execBenchmark(algLines);
+
+    // 4. test specific checks after the benchmark run completed.
+    assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches);
+    assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
+    // now we should be able to open the index for write. 
+    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
+    iw.close();
+    IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
+    assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
+  }
+
+  
  // create the benchmark and execute it. 
  private Benchmark execBenchmark(String[] algLines) throws Exception {
    String algText = algLinesToText(algLines);