From 13eaccfd56bc3f1f5532e8e58b78cc8ecf9b80d4 Mon Sep 17 00:00:00 2001
From: Grant Ingersoll <gsingers@apache.org>
Date: Mon, 12 Feb 2007 13:32:20 +0000
Subject: [PATCH] Lucene 790

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@506441 13f79535-47bb-0310-9956-ffa450edef68
---
 .../benchmark/byTask/feeds/BasicDocMaker.java | 281 ++++++++++++++++++
 .../benchmark/byTask/feeds/TrecDocMaker.java  | 210 +++++++++++++
 2 files changed, 491 insertions(+)
 create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
 create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java

diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
new file mode 100644
index 00000000000..cc8ad4ffaec
--- /dev/null
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
@@ -0,0 +1,281 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Properties;
+
+import org.apache.lucene.document.DateTools;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.utils.Format;
+
+
+/**
+ * Create documents for the test.
+ * Maintains counters of chars etc. so that sub-classes just need to 
+ * provide textual content, and the create-by-size is handled here.
+ */
+public abstract class BasicDocMaker implements DocMaker {
+  
+  private int numDocsCreated = 0; // docs created since the last resetInputs(); also the source of "docid" values
+  
+  static class DocData { // raw content of one input text, as returned by getNextDocData()
+    String name;      // becomes the "docname" field
+    Date date;        // becomes the "docdate" field
+    String title;     // becomes the "doctitle" field
+    String body;      // text for the body field; createDocument() removes whatever part it uses
+    Properties props; // each entry becomes a field of the first document created, then this is set to null
+  }
+  
+  private static class LeftOver { // what remains of a doc data after makeDocument(int) used a part of it
+    private DocData docdata; // doc data whose body still holds unused text
+    private int cnt;         // suffix number for the name of the next document cut from docdata
+  }
+
+  // leftovers are thread local, because it is unsafe to share residues between threads
+  private ThreadLocal leftovr = new ThreadLocal();
+
+  static final String BODY_FIELD = "body"; // name of the field holding the body text
+  private long numBytes = 0;       // sum of addBytes() calls since the last resetInputs()
+  private long numUniqueBytes = 0; // sum of addUniqueBytes() calls; not cleared by resetInputs()
+
+  protected Config config; // assigned in setConfig()
+
+  protected Field.Store storeVal = Field.Store.NO; // these three apply to every field created here; setConfig() overwrites them
+  protected Field.Index indexVal = Field.Index.TOKENIZED;
+  protected Field.TermVector termVecVal = Field.TermVector.NO;
+  
+  private synchronized int incrNumDocsCreated() { // hands out the number of the next doc, starting at 0
+    return numDocsCreated++; // post-increment: the value before the increment is returned
+  }
+
+  /**
+   * Return the data of the next document.
+   * @return data of the next document.
+   * @exception Exception if cannot create the next doc data
+   */
+  protected abstract DocData getNextDocData() throws Exception;
+
+  /*
+   * Makes one document from the whole body of the next doc data, dropping this thread's leftover.
+   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument()
+   */
+  public Document makeDocument () throws Exception {
+    resetLeftovers();
+    // size 0 = use the entire body, cnt -1 = no suffix appended to the doc name
+    DocData next = getNextDocData();
+    return createDocument(next,0,-1);
+  }
+
+  // Creates one document from docData. If 0 < size < body length, about size chars of the body are
+  // used and the remainder is kept in docData.body for a later call; otherwise the whole body is used.
+  // A cnt >= 0 is appended to the doc name. Props are added once and then cleared from docData.
+  private Document createDocument(DocData docData, int size, int cnt) {
+    int docid = incrNumDocsCreated();
+    Document doc = new Document();
+    doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
+    if (docData.name!=null) {
+      String name = (cnt<0 ? docData.name : docData.name+"_"+cnt);
+      doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
+    }
+    if (docData.date!=null) {
+      String dateStr = DateTools.dateToString(docData.date, DateTools.Resolution.SECOND);
+      doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
+    }
+    if (docData.title!=null) {
+      doc.add(new Field("doctitle", docData.title, storeVal, indexVal, termVecVal));
+    }
+    if (docData.body!=null && docData.body.length()>0) {
+      String bdy;
+      if (size<=0 || size>=docData.body.length()) {
+        bdy = docData.body; // use all
+        docData.body = "";  // nothing left
+      } else {
+        // attempt not to break words - if whitespace found within next 20 chars...
+        for (int n=Math.max(size-1,1); n<size+20 && n<docData.body.length(); n++) { // never cut at 0: that would use up no text
+          if (Character.isWhitespace(docData.body.charAt(n))) {
+            size = n;
+            break;
+          }
+        }
+        bdy = docData.body.substring(0,size); // use part
+        docData.body = docData.body.substring(size); // some left
+      }
+      doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
+    }
+    if (docData.props!=null) {
+      for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {
+        String key = (String) it.next();
+        String val = (String) docData.props.get(key);
+        doc.add(new Field(key, val, storeVal, indexVal, termVecVal));
+      }
+      docData.props = null;
+    }
+    //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
+    return doc;
+  }
+
+  /*
+   * Makes a doc of about size body chars from this thread's leftover text, reading more doc data as needed.
+   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument(int)
+   */
+  public Document makeDocument(int size) throws Exception {
+    LeftOver lvr = (LeftOver) leftovr.get();
+    if (lvr==null || lvr.docdata==null || lvr.docdata.body==null || lvr.docdata.body.length()==0) {
+      resetLeftovers();
+    }
+    DocData dd = (lvr==null || lvr.docdata==null ? getNextDocData() : lvr.docdata); // a leftover without doc data cannot be continued
+    int cnt = (lvr==null ? 0 : lvr.cnt);
+    while (dd.body==null || dd.body.length()<size) {
+      DocData dd2 = dd;
+      dd = getNextDocData();
+      cnt = 0;
+      dd.body = (dd2.body==null ? "" : dd2.body) + (dd.body==null ? "" : dd.body); // a null body means no text, not the string "null"
+    }
+    Document doc = createDocument(dd,size,cnt);
+    if (dd.body==null || dd.body.length()==0) {
+      resetLeftovers();
+    } else {
+      if (lvr == null) {
+        lvr = new LeftOver();
+      }
+      lvr.docdata = dd;
+      lvr.cnt = ++cnt;
+      leftovr.set(lvr); // always store: the thread-local may have been cleared at the top of this call
+    }
+    return doc;
+  }
+
+  private void resetLeftovers() { // forgets the calling thread's leftover (leftovr is a ThreadLocal)
+    leftovr.set(null);
+  }
+
+  /* (non-Javadoc)
+   * Keeps the config and reads from it how every field is stored, indexed and term-vectored.
+   * @see DocMaker#setConfig(Config)
+   */
+  public void setConfig(Config config) {
+    this.config = config;
+    // second argument of each get() is the fallback: not stored, tokenized, no term vectors
+    // (assumes config.get(name,dflt) returns dflt for a missing property - TODO confirm in Config)
+    storeVal = (config.get("doc.stored",false) ? Field.Store.YES : Field.Store.NO);
+    indexVal = (config.get("doc.tokenized",true) ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
+    termVecVal = (config.get("doc.term.vector",false) ? Field.TermVector.YES : Field.TermVector.NO);
+  }
+
+  /*
+   * Prints the statistics, then clears the doc and byte counters and the caller's leftover.
+   * @see DocMaker#resetInputs()
+   */
+  public synchronized void resetInputs() {
+    printDocStatistics(); // report before the counters below are cleared
+    numBytes = 0;
+    numDocsCreated = 0;
+    resetLeftovers(); // clears the leftover of the calling thread only
+  }
+
+  /*
+   * Returns the total passed to addUniqueBytes() so far; resetInputs() does not clear it.
+   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#numUniqueBytes()
+   */
+  public long numUniqueBytes() {
+    return numUniqueBytes;
+  }
+
+  /*
+   * Returns the number of documents created since the last resetInputs().
+   * @see DocMaker#getCount()
+   */
+  public synchronized int getCount() {
+    return numDocsCreated;
+  }
+
+  /*
+   * Returns the sum of the addBytes() calls made since the last resetInputs().
+   * @see DocMaker#getByteCount()
+   */
+  public synchronized long getByteCount() {
+    return numBytes;
+  }
+
+  protected void addUniqueBytes (long n) { // adds n to the unique-bytes total; unlike addBytes() this is not synchronized
+    numUniqueBytes += n;
+  }
+  
+  protected synchronized void addBytes (long n) { // adds n to the count returned by getByteCount()
+    numBytes += n;
+  }
+
+  /*
+   * Prints the statistics that grew since the last print, plus the counts since the last inputs reset.
+   * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#printDocStatistics()
+   */
+  private int lastPrintedNumUniqueTexts = 0;
+  private long lastPrintedNumUniqueBytes = 0;
+  private int printNum = 0;
+  public void printDocStatistics() {
+    boolean changed = false; // becomes true once there is at least one line worth printing
+    String pad = "                  ";
+    StringBuffer report = new StringBuffer();
+    String eol = System.getProperty("line.separator");
+    report.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(eol);
+    int texts = numUniqueTexts();
+    if (texts > lastPrintedNumUniqueTexts) {
+      changed = true;
+      lastPrintedNumUniqueTexts = texts;
+      report.append("total count of unique texts: ").append(Format.format(0,texts,pad)).append(eol);
+    }
+    long bytes = numUniqueBytes();
+    if (bytes > lastPrintedNumUniqueBytes) {
+      changed = true;
+      lastPrintedNumUniqueBytes = bytes;
+      report.append("total bytes of unique texts: ").append(Format.format(0,bytes,pad)).append(eol);
+    }
+    if (getCount()>0) {
+      changed = true;
+      report.append("num docs added since last inputs reset:   ").append(Format.format(0,getCount(),pad)).append(eol);
+      report.append("total bytes added since last inputs reset: ").append(Format.format(0,getByteCount(),pad)).append(eol);
+    }
+    if (changed) {
+      System.out.println(report.append(eol).toString());
+      printNum++; // numbers the reports that were actually printed
+    }
+  }
+
+  /** Recursively adds every readable plain file under f to inputFiles and adds its length to the unique bytes. */
+  protected void collectFiles(File f, ArrayList inputFiles) {
+    if (!f.canRead()) {
+      return;
+    }
+    if (!f.isDirectory()) {
+      inputFiles.add(f);
+      addUniqueBytes(f.length());
+      return;
+    }
+    File files[] = f.listFiles(); // null when the directory cannot be listed (I/O error)
+    for (int i = 0; files != null && i < files.length; i++) {
+      collectFiles(files[i],inputFiles);
+    }
+  }
+
+
+}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
new file mode 100644
index 00000000000..d85f8cc3420
--- /dev/null
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java
@@ -0,0 +1,210 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Locale;
+import java.util.Properties;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.demo.html.HTMLParser;
+
+
+/**
+ * A DocMaker using the (compressed) Trec collection for its input.
+ */
+public class TrecDocMaker extends BasicDocMaker {
+
+  private static final String newline = System.getProperty("line.separator"); // joins the lines collected by read()
+  
+  private DateFormat dateFormat; // parses the "Date: " header line; created in setConfig()
+  private File dataDir = null; // work/<docs.dir>, assigned in setConfig()
+  private ArrayList inputFiles = new ArrayList(); // the File objects collected under dataDir
+  private int nextFile = 0; // index in inputFiles of the next file to open
+  private int iteration=0; // completed passes over inputFiles; appended to every doc name
+  private BufferedReader reader; // line reader over zis; null while no file is open
+  private GZIPInputStream zis; // gzip stream of the currently open file
+  
+  /* (non-Javadoc)
+   * Collects the input files under work/<docs.dir> ("trec" is passed as the default) and creates the date parser.
+   * @see BasicDocMaker#setConfig(Config)
+   */
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    dataDir = new File(new File("work"),config.get("docs.dir","trec"));
+    collectFiles(dataDir,inputFiles);
+    if (inputFiles.isEmpty()) {
+      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
+    }
+    // header dates look like: Tue, 09 Dec 2003 22:39:08 GMT
+    dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US);
+    dateFormat.setLenient(true);
+  }
+
+  // Opens the next input file, round-robin over inputFiles; up to 20 files that fail to open are skipped.
+  private void openNextFile() throws Exception {
+    closeInputs();
+    for (int retries=0; zis==null && retries<20; ) {
+      File f = null;
+      synchronized (this) {
+        f = (File) inputFiles.get(nextFile++);
+        if (nextFile >= inputFiles.size()) { // exhausted files, start a new round
+          nextFile = 0;
+          iteration++;
+        }
+      }
+      System.out.println("opening: "+f+" length: "+f.length());
+      try {
+        zis = new GZIPInputStream(new BufferedInputStream(new FileInputStream(f)));
+      } catch (Exception e) {
+        retries++;
+        System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+"  #retries="+retries);
+      }
+    }
+    if (zis==null) { // all attempts failed; without this check the reader below would be built on null
+      throw new IOException("Could not open an input file in "+dataDir+" after 20 attempts");
+    }
+    reader = new BufferedReader(new InputStreamReader(zis));
+  }
+
+  private void closeInputs() { // closes and forgets the open stream and reader; close failures are only printed
+    if (zis!=null) {
+      try {
+        zis.close();
+      } catch (IOException e) {
+        System.out.println("closeInputs(): Ingnoring error: "+e);
+        e.printStackTrace();
+      }
+      zis = null;
+    }
+    if (reader!=null) { 
+      try {
+        reader.close(); // reader was built on zis (see openNextFile), which was already closed above
+      } catch (IOException e) {
+        System.out.println("closeInputs(): Ingnoring error: "+e);
+        e.printStackTrace();
+      }
+      reader = null;
+    }
+  }
+  
+  // Reads lines until one starts with the given prefix, continuing in the next input file
+  // whenever the current one ends. The matching line is appended to sb if collectMatchLine,
+  // the lines before it if collectAll; appended lines are joined with the line separator.
+  private StringBuffer read (String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws Exception {
+    if (sb==null) {
+      sb = new StringBuffer();
+    }
+    String sep = ""; // nothing goes before the first appended line
+    for (;;) {
+      String line = reader.readLine();
+      if (line==null) {
+        openNextFile(); // end of this file: go on scanning in the next one
+        continue;
+      }
+      // append the line if its kind (matching / not matching) was asked for
+      boolean found = line.startsWith(prefix);
+      if (found ? collectMatchLine : collectAll) {
+        sb.append(sep).append(line);
+        sep = newline;
+      }
+      if (found) {
+        return sb;
+      }
+    }
+  }
+  
+  // Reads the next <DOC> entry from the input and parses its HTML content into a DocData.
+  // Synchronized: the reader and the date format are shared, and neither is safe for concurrent use.
+  protected synchronized DocData getNextDocData() throws Exception {
+    if (reader==null) {
+      openNextFile();
+    }
+    // 1. skip until doc start
+    read("<DOC>",null,false,false);
+    // 2. name
+    StringBuffer sb = read("<DOCNO>",null,true,false);
+    String name = sb.substring("<DOCNO>".length());
+    name = name.substring(0,name.indexOf("</DOCNO>"))+"_"+iteration;
+    // 3. skip until doc header
+    read("<DOCHDR>",null,false,false);
+    // 4. date
+    sb = read("Date: ",null,true,false);
+    String dateStr = sb.substring("Date: ".length());
+    // 5. skip until end of doc header
+    read("</DOCHDR>",null,false,false);
+    // 6. collect until end of doc
+    sb = read("</DOC>",null,false,true);
+    // this is the next document, so parse it
+    HTMLParser p = new HTMLParser(new StringReader(sb.toString()));
+    // title
+    String title = p.getTitle();
+    // properties
+    Properties props = p.getMetaTags();
+    // body
+    Reader r = p.getReader();
+    char c[] = new char[1024];
+    StringBuffer bodyBuf = new StringBuffer();
+    int n;
+    while ((n = r.read(c)) >= 0) {
+      if (n>0) {
+        bodyBuf.append(c,0,n);
+      }
+    }
+    addBytes(bodyBuf.length()); // note: this is the number of chars of the parsed body
+    DocData dd = new DocData();
+    dd.date = dateFormat.parse(dateStr.trim());
+    dd.name = name;
+    dd.title = title;
+    dd.body = bodyBuf.toString();
+    dd.props = props;
+    return dd;
+  }
+
+
+  /*
+   * Also closes the open input file and rewinds to the first file, iteration 0.
+   * @see DocMaker#resetInputs()
+   */
+  public synchronized void resetInputs() {
+    super.resetInputs();
+    closeInputs();
+    nextFile = 0;
+    iteration = 0;
+  }
+
+  /*
+   * Returns the number of input files collected in setConfig() (files, not the documents inside them).
+   * @see DocMaker#numUniqueTexts()
+   */
+  public int numUniqueTexts() {
+    return inputFiles.size();
+  }
+
+}