From 02dd452026d1fa904e495105d0213f9d0d2f3cdd Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Wed, 25 Jul 2007 08:54:58 +0000 Subject: [PATCH] LUCENE-947: add creation of & indexing from 'one document per line' text files to minimize IO overhead of creating documents when running tests git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@559366 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/benchmark/CHANGES.txt | 5 + contrib/benchmark/conf/createLineFile.alg | 43 +++++ contrib/benchmark/conf/indexLineFile.alg | 53 ++++++ .../lucene/benchmark/byTask/PerfRunData.java | 10 ++ .../benchmark/byTask/feeds/BasicDocMaker.java | 33 +++- .../benchmark/byTask/feeds/DirDocMaker.java | 20 ++- .../byTask/feeds/FileBasedQueryMaker.java | 2 +- .../benchmark/byTask/feeds/LineDocMaker.java | 159 ++++++++++++++++++ .../byTask/feeds/ReutersQueryMaker.java | 4 +- .../byTask/feeds/SimpleQueryMaker.java | 6 +- .../lucene/benchmark/byTask/package.html | 4 + .../benchmark/byTask/tasks/AddDocTask.java | 6 +- .../byTask/tasks/CreateIndexTask.java | 11 +- .../benchmark/byTask/tasks/OpenIndexTask.java | 12 +- .../byTask/tasks/WriteLineDocTask.java | 137 +++++++++++++++ .../lucene/benchmark/byTask/utils/Config.java | 6 +- .../benchmark/byTask/TestPerfTasksLogic.java | 68 ++++++++ 17 files changed, 553 insertions(+), 26 deletions(-) create mode 100644 contrib/benchmark/conf/createLineFile.alg create mode 100644 contrib/benchmark/conf/indexLineFile.alg create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java create mode 100644 contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt index 7e20b421831..e50d747585f 100644 --- a/contrib/benchmark/CHANGES.txt +++ b/contrib/benchmark/CHANGES.txt @@ -4,6 +4,11 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety $Id:$ +7/24/07 + LUCENE-947: Add support for creating and index "one document per + line" from a large text file, which reduces per-document overhead of + opening a single file for each document. + 6/30/07 LUCENE-848: Added support for Wikipedia benchmarking. diff --git a/contrib/benchmark/conf/createLineFile.alg b/contrib/benchmark/conf/createLineFile.alg new file mode 100644 index 00000000000..dae602811ea --- /dev/null +++ b/contrib/benchmark/conf/createLineFile.alg @@ -0,0 +1,43 @@ +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# ------------------------------------------------------------------------------------- + +# +# This alg will process the Reuters documents feed to produce a +# single file that contains all documents, one per line. +# +# To use this, first cd to contrib/benchmark and then run: +# +# ant run-task -Dtask.alg=conf/createLineFile.alg +# +# Then, to index the documents in the line file, see +# indexLineFile.alg. +# + +# Where to get documents from: +doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker + +# Where to write the line file output: +line.file.out=work/reuters.lines.txt + +# Stop after processing the document feed once: +doc.maker.forever=false + +# ------------------------------------------------------------------------------------- + +# Process all documents, appending each one to the line file: +{WriteLineDoc()}: * diff --git a/contrib/benchmark/conf/indexLineFile.alg b/contrib/benchmark/conf/indexLineFile.alg new file mode 100644 index 00000000000..52c4af1e0c6 --- /dev/null +++ b/contrib/benchmark/conf/indexLineFile.alg @@ -0,0 +1,53 @@ +#/** +# * Licensed to the Apache Software Foundation (ASF) under one or more +# * contributor license agreements. See the NOTICE file distributed with +# * this work for additional information regarding copyright ownership. +# * The ASF licenses this file to You under the Apache License, Version 2.0 +# * (the "License"); you may not use this file except in compliance with +# * the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ +# ------------------------------------------------------------------------------------- + +# +# This file indexes documents contained in a single text file, one per +# line. See createLineFile.alg for how to create this file. The +# benefit of this is it removes the IO cost of opening one file per +# document to let you more accurately measure time spent analyzing and +# indexing your documents vs time spent creating the documents. +# +# To use this, you must first run the createLineFile.alg, then cd to +# contrib/benchmark and then run: +# +# ant run-task -Dtask.alg=conf/indexLineFile.alg +# + +analyzer=org.apache.lucene.analysis.SimpleAnalyzer + +# Feed that knows how to process the line file format: +doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker + +# File that contains one document per line: +docs.file=work/reuters.lines.txt + +# Process documents only once: +doc.maker.forever=false + +# ------------------------------------------------------------------------------------- + +# Reset the system, create a new index, index all docs from the line +# file, close the index, produce a report. + +ResetSystemErase +CreateIndex +{AddDoc}: * +CloseIndex + +RepSumByPref AddDoc diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java index 9e6bd9ba1ae..8af30f16e27 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java @@ -70,6 +70,7 @@ public class PerfRunData { private IndexReader indexReader; private IndexWriter indexWriter; private Config config; + private long startTimeMillis; // constructor public PerfRunData (Config config) throws Exception { @@ -136,6 +137,15 @@ public class PerfRunData { // release unused stuff System.runFinalization(); System.gc(); + + startTimeMillis = System.currentTimeMillis(); + } + + /** + * @return Start time in milliseconds + */ + public long getStartTimeMillis() { + return startTimeMillis; } /** diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java index 6e16f33b152..1fe45a01a25 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java @@ -39,6 +39,8 @@ import java.util.Iterator; * doc.stored=true|FALSE
* doc.tokenized=TRUE|false
* doc.term.vector=true|FALSE
+ * doc.term.vector.positions=true|FALSE
+ * doc.term.vector.offsets=true|FALSE
* doc.store.body.bytes=true|FALSE //Store the body contents raw UTF-8 bytes as a field
*/ public abstract class BasicDocMaker implements DocMaker { @@ -55,7 +57,13 @@ public abstract class BasicDocMaker implements DocMaker { // leftovers are thread local, because it is unsafe to share residues between threads private ThreadLocal leftovr = new ThreadLocal(); - static final String BODY_FIELD = "body"; + public static final String BODY_FIELD = "body"; + public static final String TITLE_FIELD = "doctitle"; + public static final String DATE_FIELD = "docdate"; + public static final String ID_FIELD = "docid"; + public static final String BYTES_FIELD = "bytes"; + public static final String NAME_FIELD = "docname"; + private long numBytes = 0; private long numUniqueBytes = 0; @@ -97,17 +105,17 @@ public abstract class BasicDocMaker implements DocMaker { private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException { int docid = incrNumDocsCreated(); Document doc = new Document(); - doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal)); + doc.add(new Field(ID_FIELD, "doc"+docid, storeVal, indexVal, termVecVal)); if (docData.getName()!=null) { String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt); - doc.add(new Field("docname", name, storeVal, indexVal, termVecVal)); + doc.add(new Field(NAME_FIELD, name, storeVal, indexVal, termVecVal)); } if (docData.getDate()!=null) { String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND); - doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal)); + doc.add(new Field(DATE_FIELD, dateStr, storeVal, indexVal, termVecVal)); } if (docData.getTitle()!=null) { - doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal)); + doc.add(new Field(TITLE_FIELD, docData.getTitle(), storeVal, indexVal, termVecVal)); } if (docData.getBody()!=null && docData.getBody().length()>0) { String bdy; @@ -127,7 +135,7 @@ public abstract class BasicDocMaker implements DocMaker { } doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal)); if (storeBytes == true) { - doc.add(new Field("bytes", bdy.getBytes("UTF-8"), Field.Store.YES)); + doc.add(new Field(BYTES_FIELD, bdy.getBytes("UTF-8"), Field.Store.YES)); } } @@ -188,7 +196,18 @@ public abstract class BasicDocMaker implements DocMaker { boolean termVec = config.get("doc.term.vector",false); storeVal = (stored ? Field.Store.YES : Field.Store.NO); indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED); - termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO); + boolean termVecPositions = config.get("doc.term.vector.positions",false); + boolean termVecOffsets = config.get("doc.term.vector.offsets",false); + if (termVecPositions && termVecOffsets) + termVecVal = Field.TermVector.WITH_POSITIONS_OFFSETS; + else if (termVecPositions) + termVecVal = Field.TermVector.WITH_POSITIONS; + else if (termVecOffsets) + termVecVal = Field.TermVector.WITH_OFFSETS; + else if (termVec) + termVecVal = Field.TermVector.YES; + else + termVecVal = Field.TermVector.NO; storeBytes = config.get("doc.store.body.bytes", false); forever = config.get("doc.maker.forever",true); } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java index 8c7861cba9d..651a828eed7 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java @@ -40,7 +40,7 @@ import java.util.Stack; */ public class DirDocMaker extends BasicDocMaker { - private DateFormat dateFormat; + private ThreadLocal dateFormat = new ThreadLocal(); private File dataDir = null; private int iteration=0; @@ -148,11 +148,21 @@ public class DirDocMaker extends BasicDocMaker { if (inputFiles==null) { throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath()); } - // date format: 30-MAR-1987 14:22:36 - dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss",Locale.US); - dateFormat.setLenient(true); } + // get/initiate a thread-local simple date format (must do so + // because SimpleDateFormat is not thread-safe). + protected DateFormat getDateFormat () { + DateFormat df = (DateFormat) dateFormat.get(); + if (df == null) { + // date format: 30-MAR-1987 14:22:36.87 + df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US); + df.setLenient(true); + dateFormat.set(df); + } + return df; + } + protected DocData getNextDocData() throws Exception { File f = null; String name = null; @@ -184,7 +194,7 @@ public class DirDocMaker extends BasicDocMaker { reader.close(); addBytes(f.length()); - Date date = dateFormat.parse(dateStr.trim()); + Date date = getDateFormat().parse(dateStr.trim()); return new DocData(name, bodyBuf.toString(), title, null, date); } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java index 4956acca8e3..b357f676760 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java @@ -46,7 +46,7 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMake Analyzer anlzr = (Analyzer) Class.forName(config.get("analyzer", "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance(); - String defaultField = config.get("file.query.maker.default.field", "body"); + String defaultField = config.get("file.query.maker.default.field", BasicDocMaker.BODY_FIELD); QueryParser qp = new QueryParser(defaultField, anlzr); List qq = new ArrayList(); diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java new file mode 100644 index 00000000000..0bdc6787626 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java @@ -0,0 +1,159 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.FileReader; + +/** + * A DocMaker reading one line at a time as a Document from + * a single file. This saves IO cost (over DirDocMaker) of + * recursing through a directory and opening a new file for + * every document. It also re-uses its Document and Field + * instance to improve indexing speed. + * + * Config properties: + * docs.file=<path to the file%gt; + */ +public class LineDocMaker extends BasicDocMaker { + + private BufferedReader fileIn; + private ThreadLocal docState = new ThreadLocal(); + private String fileName; + + private static int READER_BUFFER_BYTES = 64*1024; + + private class DocState { + Document doc; + Field bodyField; + Field titleField; + Field dateField; + + public DocState() { + + bodyField = new Field(BasicDocMaker.BODY_FIELD, + "", + storeVal, + Field.Index.TOKENIZED, + termVecVal); + titleField = new Field(BasicDocMaker.TITLE_FIELD, + "", + storeVal, + Field.Index.TOKENIZED, + termVecVal); + dateField = new Field(BasicDocMaker.TITLE_FIELD, + "", + storeVal, + Field.Index.TOKENIZED, + termVecVal); + + doc = new Document(); + doc.add(bodyField); + doc.add(titleField); + doc.add(dateField); + } + + final static String SEP = WriteLineDocTask.SEP; + + public Document setFields(String line) { + // title date body + int spot = line.indexOf(SEP); + titleField.setValue(line.substring(0, spot)); + int spot2 = line.indexOf(SEP, 1+spot); + dateField.setValue(line.substring(1+spot, spot2)); + bodyField.setValue(line.substring(1+spot2, line.length())); + return doc; + } + } + + /* (non-Javadoc) + * @see SimpleDocMaker#setConfig(java.util.Properties) + */ + public void setConfig(Config config) { + super.setConfig(config); + resetInputs(); + } + + protected DocData getNextDocData() throws Exception { + throw new RuntimeException("not implemented"); + } + + private DocState getDocState() { + DocState ds = (DocState) docState.get(); + if (ds == null) { + ds = new DocState(); + docState.set(ds); + } + return ds; + } + + public Document makeDocument() throws Exception { + + String line; + synchronized(this) { + while(true) { + line = fileIn.readLine(); + if (line == null) { + if (!forever) + throw new NoMoreDataException(); + else { + // Reset the file + openFile(); + } + } else { + break; + } + } + } + + return getDocState().setFields(line); + } + + public Document makeDocument(int size) throws Exception { + throw new RuntimeException("cannot change document size with LineDocMaker; please use DirDocMaker instead"); + } + + public synchronized void resetInputs() { + super.resetInputs(); + fileName = config.get("docs.file", null); + if (fileName == null) + throw new RuntimeException("docs.file must be set"); + openFile(); + } + + private void openFile() { + try { + if (fileIn != null) + fileIn.close(); + fileIn = new BufferedReader(new FileReader(fileName), READER_BUFFER_BYTES); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public int numUniqueTexts() { + return -1; + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java index 5c2d940e5b2..0b0081b4e44 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java @@ -71,7 +71,7 @@ public class ReutersQueryMaker extends AbstractQueryMaker implements QueryMaker * @return array of Lucene queries */ private static Query[] createQueries(List qs, Analyzer a) { - QueryParser qp = new QueryParser("body", a); + QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD, a); List queries = new ArrayList(); for (int i = 0; i < qs.size(); i++) { try { @@ -107,7 +107,7 @@ public class ReutersQueryMaker extends AbstractQueryMaker implements QueryMaker List queryList = new ArrayList(20); queryList.addAll(Arrays.asList(STANDARD_QUERIES)); - queryList.addAll(Arrays.asList(getPrebuiltQueries("body"))); + queryList.addAll(Arrays.asList(getPrebuiltQueries(BasicDocMaker.BODY_FIELD))); return createQueries(queryList, anlzr); } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java index 6468e2e0708..638fbd01438 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java @@ -45,11 +45,11 @@ public class SimpleQueryMaker extends AbstractQueryMaker implements QueryMaker { Analyzer anlzr= (Analyzer) Class.forName(config.get("analyzer", "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance(); - QueryParser qp = new QueryParser("body",anlzr); + QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD,anlzr); ArrayList qq = new ArrayList(); - Query q1 = new TermQuery(new Term("docid","doc2")); + Query q1 = new TermQuery(new Term(BasicDocMaker.ID_FIELD,"doc2")); qq.add(q1); - Query q2 = new TermQuery(new Term("body","simple")); + Query q2 = new TermQuery(new Term(BasicDocMaker.BODY_FIELD,"simple")); qq.add(q2); BooleanQuery bq = new BooleanQuery(); bq.add(q1,Occur.MUST); diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html index ea5bc00fd68..a19c6baee06 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html @@ -519,6 +519,8 @@ Here is a list of currently defined properties:
  • doc.stored
  • doc.tokenized
  • doc.term.vector +
  • doc.term.vector.positions +
  • doc.term.vector.offsets
  • doc.store.body.bytes
  • docs.dir
  • query.maker @@ -540,6 +542,8 @@ Here is a list of currently defined properties:
  • merge.factor
  • max.buffered
  • directory +
  • ram.flush.mb +
  • autocommit
  • diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java index e7c023d249d..4af39963641 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java @@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.document.Document; +import java.text.NumberFormat; /** @@ -81,7 +82,10 @@ public class AddDocTask extends PerfTask { logStep = getRunData().getConfig().get("doc.add.log.step",DEFAULT_ADD_DOC_LOG_STEP); } if (logStep>0 && (count%logStep)==0) { - System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs"); + double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0; + NumberFormat nf = NumberFormat.getInstance(); + nf.setMaximumFractionDigits(2); + System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs"); } } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java index 5093012678c..4c3cd48fccd 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java @@ -30,7 +30,8 @@ import java.io.IOException; * Create an index. *
    Other side effects: index writer object in perfRunData is set. *
    Relevant properties: merge.factor, max.buffered, - * max.field.length. + * max.field.length, ram.flush.mb [default 0], autocommit + * [default true]. */ public class CreateIndexTask extends PerfTask { @@ -42,19 +43,23 @@ public class CreateIndexTask extends PerfTask { Directory dir = getRunData().getDirectory(); Analyzer analyzer = getRunData().getAnalyzer(); - IndexWriter iw = new IndexWriter(dir, analyzer, true); - Config config = getRunData().getConfig(); boolean cmpnd = config.get("compound",true); int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR); int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED); int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH); + double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB); + boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT); + IndexWriter iw = new IndexWriter(dir, autoCommit, analyzer, true); + iw.setUseCompoundFile(cmpnd); iw.setMergeFactor(mrgf); iw.setMaxBufferedDocs(mxbf); iw.setMaxFieldLength(mxfl); + if (flushAtRAMUsage > 0) + iw.setRAMBufferSizeMB(flushAtRAMUsage); getRunData().setIndexWriter(iw); return 1; diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java index e60edb86f9d..7e1b1b2ea1b 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java @@ -30,14 +30,16 @@ import java.io.IOException; * Open an index writer. *
    Other side effects: index writer object in perfRunData is set. *
    Relevant properties: merge.factor, max.buffered, - * max.field.length. -. + * max.field.length, ram.flush.mb [default 0], autocommit + * [default true]. */ public class OpenIndexTask extends PerfTask { public static final int DEFAULT_MAX_BUFFERED = 10; public static final int DEFAULT_MAX_FIELD_LENGTH = 10000; public static final int DEFAULT_MERGE_PFACTOR = 10; + public static final int DEFAULT_RAM_FLUSH_MB = 0; + public static final boolean DEFAULT_AUTO_COMMIT = true; public OpenIndexTask(PerfRunData runData) { super(runData); @@ -46,7 +48,6 @@ public class OpenIndexTask extends PerfTask { public int doLogic() throws IOException { Directory dir = getRunData().getDirectory(); Analyzer analyzer = getRunData().getAnalyzer(); - IndexWriter writer = new IndexWriter(dir, analyzer, false); Config config = getRunData().getConfig(); @@ -54,12 +55,17 @@ public class OpenIndexTask extends PerfTask { int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR); int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED); int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH); + double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB); + boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT); + IndexWriter writer = new IndexWriter(dir, autoCommit, analyzer, false); // must update params for newly opened writer writer.setMaxBufferedDocs(mxbf); writer.setMaxFieldLength(mxfl); writer.setMergeFactor(mrgf); writer.setUseCompoundFile(cmpnd); // this one redundant? + if (flushAtRAMUsage > 0) + writer.setRAMBufferSizeMB(flushAtRAMUsage); getRunData().setIndexWriter(writer); return 1; diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java new file mode 100644 index 00000000000..97c57c11ee0 --- /dev/null +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java @@ -0,0 +1,137 @@ +package org.apache.lucene.benchmark.byTask.tasks; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedWriter; +import java.io.FileWriter; + +import org.apache.lucene.benchmark.byTask.PerfRunData; +import org.apache.lucene.benchmark.byTask.feeds.DocMaker; +import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker; +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + + +public class WriteLineDocTask extends PerfTask { + + /** + * Default value for property doc.add.log.step - indicating how often + * an "added N docs" message should be logged. + */ + public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000; + + public WriteLineDocTask(PerfRunData runData) { + super(runData); + } + + private int logStep = -1; + private int docSize = 0; + int count = 0; + private BufferedWriter lineFileOut=null; + private DocMaker docMaker; + + public final static String SEP = "\t"; + + /* + * (non-Javadoc) + * @see PerfTask#setup() + */ + public void setup() throws Exception { + super.setup(); + if (lineFileOut==null) { + Config config = getRunData().getConfig(); + String fileName = config.get("line.file.out", null); + if (fileName == null) + throw new Exception("line.file.out must be set"); + lineFileOut = new BufferedWriter(new FileWriter(fileName)); + } + docMaker = getRunData().getDocMaker(); + } + + public void tearDown() throws Exception { + log(++count); + super.tearDown(); + } + + public int doLogic() throws Exception { + Document doc; + if (docSize > 0) { + doc = docMaker.makeDocument(docSize); + } else { + doc = docMaker.makeDocument(); + } + + Field f = doc.getField(BasicDocMaker.BODY_FIELD); + + String body, title, date; + if (f != null) + body = f.stringValue().replace('\t', ' '); + else + body = null; + + f = doc.getField(BasicDocMaker.TITLE_FIELD); + if (f != null) + title = f.stringValue().replace('\t', ' '); + else + title = ""; + + f = doc.getField(BasicDocMaker.DATE_FIELD); + if (f != null) + date = f.stringValue().replace('\t', ' '); + else + date = ""; + + if (body != null) { + lineFileOut.write(title, 0, title.length()); + lineFileOut.write(SEP); + lineFileOut.write(date, 0, date.length()); + lineFileOut.write(SEP); + lineFileOut.write(body, 0, body.length()); + lineFileOut.newLine(); + lineFileOut.flush(); + } + return 1; + } + + private void log (int count) { + if (logStep<0) { + // init once per instance + logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP); + } + if (logStep>0 && (count%logStep)==0) { + System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs"); + } + } + + /** + * Set the params (docSize only) + * @param params docSize, or 0 for no limit. + */ + public void setParams(String params) { + super.setParams(params); + docSize = (int) Float.parseFloat(params); + } + + /* (non-Javadoc) + * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() + */ + public boolean supportsParams() { + return true; + } +} diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java index af6f0b161bd..798786b53b9 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java @@ -22,6 +22,8 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; +import java.util.List; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.Properties; @@ -110,7 +112,9 @@ public class Config { private void printProps() { System.out.println("------------> config properties:"); - for (Iterator it = props.keySet().iterator(); it.hasNext();) { + List propKeys = new ArrayList(props.keySet()); + Collections.sort(propKeys); + for (Iterator it = propKeys.iterator(); it.hasNext();) { String propName = (String) it.next(); System.out.println(propName + " = " + props.getProperty(propName)); } diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java index 907c9902e76..2cd86d4641f 100755 --- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java +++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java @@ -18,6 +18,9 @@ package org.apache.lucene.benchmark.byTask; import java.io.StringReader; +import java.io.File; +import java.io.FileReader; +import java.io.BufferedReader; import org.apache.lucene.benchmark.byTask.Benchmark; import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask; @@ -79,6 +82,7 @@ public class TestPerfTasksLogic extends TestCase { iw.close(); IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs()); + ir.close(); } /** @@ -121,6 +125,7 @@ public class TestPerfTasksLogic extends TestCase { iw.close(); IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs()); + ir.close(); } /** @@ -150,6 +155,69 @@ public class TestPerfTasksLogic extends TestCase { IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); int ndocsExpected = 21578; // that's how many docs there are in the Reuters collecton. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); + ir.close(); + } + + /** + * Test WriteLineDoc and LineDocMaker. + */ + public void testLineDocFile() throws Exception { + File lineFile = new File(System.getProperty("tempDir"), "test.reuters.lines.txt"); + + // We will call WriteLineDocs this many times + final int NUM_TRY_DOCS = 500; + + // Creates a line file with first 500 docs from reuters + String algLines1[] = { + "# ----- properties ", + "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker", + "doc.maker.forever=false", + "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'), + "# ----- alg ", + "{WriteLineDoc()}:" + NUM_TRY_DOCS, + }; + + // Run algo + Benchmark benchmark = execBenchmark(algLines1); + + // Verify we got somewhere between 1-500 lines (some + // Reuters docs have no body, which WriteLineDoc task + // skips). + BufferedReader r = new BufferedReader(new FileReader(lineFile)); + int numLines = 0; + while(r.readLine() != null) + numLines++; + r.close(); + assertTrue("did not see the right number of docs; should be > 0 and <= " + NUM_TRY_DOCS + " but was " + numLines, numLines > 0 && numLines <= NUM_TRY_DOCS); + + // Index the line docs + String algLines2[] = { + "# ----- properties ", + "analyzer=org.apache.lucene.analysis.SimpleAnalyzer", + "doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker", + "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'), + "doc.maker.forever=false", + "autocommit=false", + "ram.flush.mb=4", + "# ----- alg ", + "ResetSystemErase", + "CreateIndex", + "{AddDoc}: *", + "CloseIndex", + }; + + // Run algo + benchmark = execBenchmark(algLines2); + + // now we should be able to open the index for write. + IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false); + iw.close(); + + IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); + assertEquals(numLines + " lines were were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs()); + ir.close(); + + lineFile.delete(); } // create the benchmark and execute it.