From 13eaccfd56bc3f1f5532e8e58b78cc8ecf9b80d4 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Mon, 12 Feb 2007 13:32:20 +0000 Subject: [PATCH] Lucene 790 git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@506441 13f79535-47bb-0310-9956-ffa450edef68 --- .../benchmark/byTask/feeds/BasicDocMaker.java | 281 ++++++++++++++++++ .../benchmark/byTask/feeds/TrecDocMaker.java | 210 +++++++++++++ 2 files changed, 491 insertions(+) create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java new file mode 100644 index 00000000000..cc8ad4ffaec --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java @@ -0,0 +1,281 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.File; +import java.util.ArrayList; +import java.util.Date; +import java.util.Iterator; +import java.util.Properties; + +import org.apache.lucene.document.DateTools; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.benchmark.byTask.utils.Format; + + +/** + * Create documents for the test. + * Maintains counters of chars etc. so that sub-classes just need to + * provide textual content, and the create-by-size is handled here. + */ +public abstract class BasicDocMaker implements DocMaker { + + private int numDocsCreated = 0; + + static class DocData { + String name; + Date date; + String title; + String body; + Properties props; + } + + private static class LeftOver { + private DocData docdata; + private int cnt; + } + + // leftovers are thread local, because it is unsafe to share residues between threads + private ThreadLocal leftovr = new ThreadLocal(); + + static final String BODY_FIELD = "body"; + private long numBytes = 0; + private long numUniqueBytes = 0; + + protected Config config; + + protected Field.Store storeVal = Field.Store.NO; + protected Field.Index indexVal = Field.Index.TOKENIZED; + protected Field.TermVector termVecVal = Field.TermVector.NO; + + private synchronized int incrNumDocsCreated() { + return numDocsCreated++; + } + + /** + * Return the data of the next document. + * @return data of the next document. + * @exception if cannot create the next doc data + */ + protected abstract DocData getNextDocData() throws Exception; + + /* + * (non-Javadoc) + * @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#makeDocument() + */ + public Document makeDocument () throws Exception { + resetLeftovers(); + DocData docData = getNextDocData(); + Document doc = createDocument(docData,0,-1); + return doc; + } + + // create a doc + // use only part of the body, modify it to keep the rest (or use all if size==0). 
+ // reset the docdata properties so they are not added more than once. + private Document createDocument(DocData docData, int size, int cnt) { + int docid = incrNumDocsCreated(); + Document doc = new Document(); + doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal)); + if (docData.name!=null) { + String name = (cnt<0 ? docData.name : docData.name+"_"+cnt); + doc.add(new Field("docname", name, storeVal, indexVal, termVecVal)); + } + if (docData.date!=null) { + String dateStr = DateTools.dateToString(docData.date, DateTools.Resolution.SECOND); + doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal)); + } + if (docData.title!=null) { + doc.add(new Field("doctitle", docData.title, storeVal, indexVal, termVecVal)); + } + if (docData.body!=null && docData.body.length()>0) { + String bdy; + if (size<=0 || size>=docData.body.length()) { + bdy = docData.body; // use all + docData.body = ""; // nothing left + } else { + // attempt not to break words - if whitespace found within next 20 chars... 
+ for (int n=size-1; n ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum).append("): ").append(newline); + int nut = numUniqueTexts(); + if (nut > lastPrintedNumUniqueTexts) { + print = true; + sb.append("total count of unique texts: ").append(Format.format(0,nut,col)).append(newline); + lastPrintedNumUniqueTexts = nut; + } + long nub = numUniqueBytes(); + if (nub > lastPrintedNumUniqueBytes) { + print = true; + sb.append("total bytes of unique texts: ").append(Format.format(0,nub,col)).append(newline); + lastPrintedNumUniqueBytes = nub; + } + if (getCount()>0) { + print = true; + sb.append("num docs added since last inputs reset: ").append(Format.format(0,getCount(),col)).append(newline); + sb.append("total bytes added since last inputs reset: ").append(Format.format(0,getByteCount(),col)).append(newline); + } + if (print) { + System.out.println(sb.append(newline).toString()); + printNum++; + } + } + + protected void collectFiles(File f, ArrayList inputFiles) { + if (!f.canRead()) { + return; + } + if (f.isDirectory()) { + File files[] = f.listFiles(); + for (int i = 0; i < files.length; i++) { + collectFiles(files[i],inputFiles); + } + return; + } + inputFiles.add(f); + addUniqueBytes(f.length()); + } + + +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java new file mode 100644 index 00000000000..d85f8cc3420 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocMaker.java @@ -0,0 +1,210 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Locale; +import java.util.Properties; +import java.util.zip.GZIPInputStream; + +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.demo.html.HTMLParser; + + +/** + * A DocMaker using the (compressed) Trec collection for its input. 
+ * <p>
+ * Input files are read through a {@link GZIPInputStream}, one after the other;
+ * when the last file has been handed out the maker starts over with the first
+ * one and increments an iteration counter that is appended to every document
+ * name. The part of each document that follows the document header is run
+ * through {@link HTMLParser} to obtain title, meta tags and body text.
+ * <p>
+ * Reading from the shared input is serialized on this instance, so a single
+ * maker can be polled from several threads.
+ */
+public class TrecDocMaker extends BasicDocMaker {
+
+  private static final String newline = System.getProperty("line.separator");
+
+  // TREC markup that delimits a document and its parts. read() matches these
+  // with String.startsWith(), i.e. each one is expected at the start of a line.
+  private static final String DOC = "<DOC>";
+  private static final String TERM_DOC = "</DOC>";
+  private static final String DOCNO = "<DOCNO>";
+  private static final String TERM_DOCNO = "</DOCNO>";
+  private static final String DOCHDR = "<DOCHDR>";
+  private static final String TERM_DOCHDR = "</DOCHDR>";
+  private static final String DATE = "Date: ";
+
+  // number of unreadable files openNextFile() tolerates before giving up
+  private static final int MAX_OPEN_RETRIES = 20;
+
+  private DateFormat dateFormat;
+  private File dataDir = null;
+  private ArrayList inputFiles = new ArrayList();
+  private int nextFile = 0;
+  private int iteration=0;
+  private BufferedReader reader;
+  private GZIPInputStream zis;
+
+  /**
+   * Configures this maker: resolves the "docs.dir" setting (default "trec")
+   * below the "work" directory, collects every readable file under it as
+   * input and prepares the parser for the "Date: " header line.
+   * @param config benchmark configuration, also passed on to the super class.
+   * @throws RuntimeException if no input file was found.
+   */
+  public void setConfig(Config config) {
+    super.setConfig(config);
+    String d = config.get("docs.dir","trec");
+    dataDir = new File(new File("work"),d);
+    collectFiles(dataDir,inputFiles);
+    if (inputFiles.size()==0) {
+      throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
+    }
+    // format of the "Date: " header value, e.g.: Tue, 09 Dec 2003 22:39:08 GMT
+    dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US);
+    dateFormat.setLenient(true);
+  }
+
+  /**
+   * Closes the current input (if any) and opens the next readable input file,
+   * wrapping around to the first file (and incrementing the iteration counter)
+   * once the list is exhausted. Files that cannot be opened as gzip streams
+   * are skipped.
+   * @throws IOException if MAX_OPEN_RETRIES files in a row could not be opened.
+   */
+  private void openNextFile() throws Exception {
+    closeInputs();
+    GZIPInputStream opened = null;
+    int retries = 0;
+    while (opened==null && retries<MAX_OPEN_RETRIES) {
+      File f = null;
+      synchronized (this) {
+        f = (File) inputFiles.get(nextFile++);
+        if (nextFile >= inputFiles.size()) {
+          // exhausted files, start a new round
+          nextFile = 0;
+          iteration++;
+        }
+      }
+      System.out.println("opening: "+f+" length: "+f.length());
+      FileInputStream fis = null;
+      try {
+        fis = new FileInputStream(f);
+        // the constructor reads the gzip header and fails for non-gzip files
+        opened = new GZIPInputStream(new BufferedInputStream(fis));
+      } catch (Exception e) {
+        retries++;
+        System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries);
+        // do not leak the file handle of a file we are not going to read
+        if (fis!=null) {
+          try {
+            fis.close();
+          } catch (IOException ignored) {
+            // nothing useful can be done, the file is being skipped anyway
+          }
+        }
+      }
+    }
+    if (opened==null) {
+      // fail with a clear message instead of a NullPointerException below
+      throw new IOException("Could not open any input file after "+MAX_OPEN_RETRIES+
+          " attempts, dataDir: "+dataDir.getAbsolutePath());
+    }
+    zis = opened;
+    reader = new BufferedReader(new InputStreamReader(zis));
+  }
+
+  /**
+   * Closes the current reader and its underlying gzip stream, if open.
+   * Errors are reported to stdout and otherwise ignored (best effort).
+   */
+  private void closeInputs() {
+    if (reader!=null) {
+      try {
+        reader.close();
+      } catch (IOException e) {
+        System.out.println("closeInputs(): Ingnoring error: "+e);
+        e.printStackTrace();
+      }
+      reader = null;
+    }
+    if (zis!=null) {
+      try {
+        zis.close();
+      } catch (IOException e) {
+        System.out.println("closeInputs(): Ingnoring error: "+e);
+        e.printStackTrace();
+      }
+      zis = null;
+    }
+  }
+
+  /**
+   * Reads lines until one that starts with the specified prefix is found.
+   * When the current file ends before that, reading silently continues with
+   * the next input file.
+   * @param prefix prefix of the line that terminates the read.
+   * @param sb buffer to append to, or null to allocate a new one.
+   * @param collectMatchLine whether the terminating line itself is appended.
+   * @param collectAll whether the lines before the terminating line are appended.
+   * @return the buffer holding the collected lines, separated by the platform newline.
+   */
+  private StringBuffer read (String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws Exception {
+    sb = (sb==null ? new StringBuffer() : sb);
+    String sep = "";
+    while (true) {
+      String line = reader.readLine();
+      if (line==null) {
+        openNextFile();
+        continue;
+      }
+      if (line.startsWith(prefix)) {
+        if (collectMatchLine) {
+          sb.append(sep).append(line);
+        }
+        break;
+      }
+      if (collectAll) {
+        sb.append(sep).append(line);
+        sep = newline;
+      }
+    }
+    return sb;
+  }
+
+  /**
+   * Reads the next TREC document from the input and converts it to doc data.
+   * @return name (suffixed with the current iteration), date, title, body and
+   * meta tags of the next document.
+   * @throws Exception if the input cannot be read or the date cannot be parsed.
+   */
+  protected DocData getNextDocData() throws Exception {
+    String name;
+    String dateStr;
+    String html;
+    // The reader is shared, so all lines of one document must be consumed by
+    // a single thread without interleaving.
+    synchronized (this) {
+      if (reader==null) {
+        openNextFile();
+      }
+      // 1. skip until doc start
+      read(DOC,null,false,false);
+      // 2. name
+      StringBuffer sb = read(DOCNO,null,true,false);
+      name = sb.substring(DOCNO.length());
+      name = name.substring(0,name.indexOf(TERM_DOCNO))+"_"+iteration;
+      // 3. skip until doc header
+      read(DOCHDR,null,false,false);
+      // 4. date
+      sb = read(DATE,null,true,false);
+      dateStr = sb.substring(DATE.length());
+      // 5. skip until end of doc header
+      read(TERM_DOCHDR,null,false,false);
+      // 6. collect until end of doc
+      html = read(TERM_DOC,null,false,true).toString();
+    }
+    // this is the next document, so parse it
+    HTMLParser p = new HTMLParser(new StringReader(html));
+    // title
+    String title = p.getTitle();
+    // properties
+    Properties props = p.getMetaTags();
+    // body
+    Reader r = p.getReader();
+    char c[] = new char[1024];
+    StringBuffer bodyBuf = new StringBuffer();
+    int n;
+    while ((n = r.read(c)) >= 0) {
+      if (n>0) {
+        bodyBuf.append(c,0,n);
+      }
+    }
+    addBytes(bodyBuf.length());
+
+    DocData dd = new DocData();
+
+    // SimpleDateFormat instances must not be used by two threads at once
+    synchronized (dateFormat) {
+      dd.date = dateFormat.parse(dateStr.trim());
+    }
+    dd.name = name;
+    dd.title = title;
+    dd.body = bodyBuf.toString();
+    dd.props = props;
+    return dd;
+  }
+
+
+  /**
+   * Closes the current input and restarts from the first input file with the
+   * iteration counter back at 0.
+   * @see DocMaker#resetInputs()
+   */
+  public synchronized void resetInputs() {
+    super.resetInputs();
+    closeInputs();
+    nextFile = 0;
+    iteration = 0;
+  }
+
+  /**
+   * Returns the number of input files collected by setConfig().
+   * @see DocMaker#numUniqueTexts()
+   */
+  public int numUniqueTexts() {
+    return inputFiles.size();
+  }
+
+}